コード例 #1
0
ファイル: resc_def_all.c プロジェクト: msbritt/torque
/*
 * set_proc_ct - recompute the "procct" resource after "procs" changes.
 *
 * "procct" is set to the value of the "procs" resource plus, when a "nodes"
 * resource is also present in the attribute, the processor count that
 * count_proc() derives from the "nodes" string.
 *
 * pprocsp - the "procs" resource entry whose value is being applied
 * pattr   - the attribute holding the job's resource list
 * actmode - action mode; ATR_ACTION_RECOV is a no-op
 *
 * Returns 0 on success (including the ATR_ACTION_RECOV no-op) and
 * PBSE_SYSTEM when a resource definition cannot be found or the "procct"
 * entry cannot be created.
 */
int set_proc_ct(

  resource      *pprocsp,  /* I */
  pbs_attribute *pattr,    /* I */
  int            actmode)  /* I */

  {
  resource *pnodesp;
  resource_def *pndef;
  resource *ppct;
  resource_def *ppdef;

  if (actmode == ATR_ACTION_RECOV)
    {
    /* SUCCESS - nothing is recomputed in recovery mode */

    return(0);
    }

  /* set "procct" to count of processors in "nodes" plus "procs" */

  ppdef = find_resc_def(svr_resc_def, "procct", svr_resc_size);

  if (ppdef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* locate the existing "procct" entry, creating it on first use */

  if ((ppct = find_resc_entry(pattr, ppdef)) == NULL)
    {
    if ((ppct = add_resource_entry(pattr, ppdef)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }

  pndef = find_resc_def(svr_resc_def, "nodes", svr_resc_size);

  if (pndef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* "procs" always contributes */

  ppct->rs_value.at_val.at_long = pprocsp->rs_value.at_val.at_long;

  if ((pnodesp = find_resc_entry(pattr, pndef)) != NULL)
    {
    /* "nodes" is present too: add its processors instead of discarding
     * the count, mirroring the computation done in set_node_ct() */

    ppct->rs_value.at_val.at_long +=
      count_proc(pnodesp->rs_value.at_val.at_str);
    }

  ppct->rs_value.at_flags |= ATR_VFLAG_SET;

  return(0);
  }  /* END set_proc_ct() */
コード例 #2
0
ファイル: job_route.c プロジェクト: spuder/torque
/*
 * remove_procct - release the "procct" resource value held in a job's
 * resource-list attribute, if one is present.
 *
 * pjob - the job whose JOB_ATR_resource attribute is examined
 *
 * Returns PBSE_NONE when the value was released or there was nothing to
 * release. Returns ROUTE_PERM_FAILURE (and sets pbs_errno to PBSE_INTERNAL)
 * if the resource attribute pointer is NULL.
 */
int remove_procct(job *pjob)
{
    char id[] = "remove_procct";
    pbs_attribute    *pattr;
    resource_def *pctdef;
    resource     *pctresc;

    pattr = &pjob->ji_wattr[JOB_ATR_resource];
    /* NOTE(review): pattr is the address of an array element, so this test
       cannot fire for a valid pjob; kept to preserve the original contract */
    if(pattr == NULL)
    {
        /* Something is really wrong. ji_wattr[JOB_ATR_resource] should always be set
           by the time this function is called */
        sprintf(log_buffer, "%s: Resource_List is NULL. Cannot proceed", id);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
        pbs_errno = PBSE_INTERNAL;
        return(ROUTE_PERM_FAILURE);
    }

    /* unset the procct resource if it has been set */
    pctdef = find_resc_def(svr_resc_def, "procct", svr_resc_size);

    if (pctdef == NULL)
    {
        /* without a definition there is no entry to look up and no rs_free
           to call, so there is nothing to unset */
        return(PBSE_NONE);
    }

    if ((pctresc = find_resc_entry(pattr, pctdef)) != NULL)
        pctdef->rs_free(&pctresc->rs_value);

    return(PBSE_NONE);
} /* END remove_procct */
コード例 #3
0
ファイル: svr_resccost.c プロジェクト: AlbertDeFusco/torque
/*
 * decode_rcost - decode one "resource cost" value into an attribute.
 *
 * The attribute holds a list of resource_cost entries. The entry whose
 * definition matches the resource named by rescn is located (or created via
 * add_cost_entry()) and its cost is set to the integer parsed from val.
 *
 * When either val or rescn is NULL the attribute is marked unset and
 * modified, and 0 is returned. Returns PBSE_UNKRESC when rescn names no
 * known resource, PBSE_SYSTEM when a new entry cannot be added, 0 otherwise.
 */
int decode_rcost(

  pbs_attribute *patr,
  const char   *name,  /* pbs_attribute name */
  const char *rescn, /* resource name, selects the cost entry */
  const char    *val,   /* pbs_attribute value */
  int            perm)  /* used only with resources */

  {
  void free_rcost(pbs_attribute *);

  resource_def         *rdef;
  struct resource_cost *entry;

  /* nothing to decode: flag the attribute as unset but modified */

  if ((rescn == NULL) || (val == NULL))
    {
    patr->at_flags &= ~ATR_VFLAG_SET;
    patr->at_flags |= ATR_VFLAG_MODIFY;

    return(0);
    }

  /* a previously set value is released before decoding the new one */

  if (patr->at_flags & ATR_VFLAG_SET)
    {
    free_rcost(patr);
    }

  rdef = find_resc_def(svr_resc_def, rescn, svr_resc_size);

  if (rdef == NULL)
    {
    return(PBSE_UNKRESC);
    }

  /* walk the cost list looking for an entry already tied to this resource */

  for (entry = (struct resource_cost *)GET_NEXT(patr->at_val.at_list);
       entry != NULL;
       entry = (struct resource_cost *)GET_NEXT(entry->rc_link))
    {
    if (entry->rc_def == rdef)
      {
      break;
      }
    }

  /* no entry yet: add one */

  if ((entry == NULL) &&
      ((entry = add_cost_entry(patr, rdef)) == NULL))
    {
    return(PBSE_SYSTEM);
    }

  entry->rc_cost = atol(val);

  patr->at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODIFY;

  return(0);
  }
コード例 #4
0
ファイル: mom_req_quejob.c プロジェクト: msbritt/torque
/*
 * mom_req_quejob - handle a Queue Job batch request on the MOM.
 *
 * Flow, as implemented below:
 *   1. Reject the request if the node health check marks the MOM down, if
 *      the MOM is configured not to accept jobs, or if the request did not
 *      come from a server.
 *   2. Look the job id up among the known jobs and the svr_newjobs list.
 *      An existing job is only accepted when it carries a checkpoint file
 *      (and is not running); otherwise the request is rejected. An unknown
 *      job gets a freshly allocated job structure.
 *   3. Decode every attribute in the request into the job. For an existing
 *      checkpointed job only ATTR_checkpoint_name and ATTR_v are decoded.
 *      Any decode or permission failure purges the job and rejects the
 *      request.
 *   4. Reply with the job id and link the job into svr_newjobs.
 *
 * preq - the decoded batch request; replies/rejections are sent through it.
 */
void mom_req_quejob(

  batch_request *preq) /* ptr to the decoded request   */

  {
  char           basename[PBS_JOBBASE + 1];
  int            created_here = 0;
  int            index;
  char          *jid;
  attribute_def *pdef;
  job           *pj;
  svrattrl      *psatl;
  int            rc;
  int            sock = preq->rq_conn;

  int            IsCheckpoint = 0;
  /* set basic (user) level access permission */
  int            resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat;

  memset(basename, 0, sizeof(basename));

  if (PBSNodeCheckProlog)
    {
    check_state(1);

    if (internal_state & INUSE_DOWN)
      {
      req_reject(PBSE_BADMOMSTATE, 0, preq, NULL, NULL);

      return;
      }
    }

  if (reject_job_submit == TRUE)
    {
    req_reject(-1, 0, preq, NULL, "This mom is configured not to run jobs");
    return;
    }

  if (preq->rq_fromsvr)
    {
    /* from another server - accept the extra attributes */

    resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM;

    jid = preq->rq_ind.rq_queuejob.rq_jid;
    }
  else
    {
    /* request must be from server */

    log_err(errno, __func__, (char *)"request not from server");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server");

    return;
    }

  /* does job already exist, check both old and new jobs */

  if ((pj = mom_find_job(jid)) == NULL)
    {
    pj = (job *)GET_NEXT(svr_newjobs);

    while (pj != NULL)
      {
      if (!strcmp(pj->ji_qs.ji_jobid, jid))
        break;

      pj = (job *)GET_NEXT(pj->ji_alljobs);
      }
    }

  /*
   * New job ...
   *
   * for MOM - rather than make up a hashname, we use the name sent
   * to us by the server as an pbs_attribute.
   */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (!strcmp(psatl->al_name,ATTR_hashname))
      {
      snprintf(basename, sizeof(basename), "%s", psatl->al_value);

      break;
      }

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }

  /* no hashname attribute was sent: fall back to the job id */

  if (basename[0] == '\0')
    snprintf(basename, sizeof(basename), "%s", jid);

  if (pj != NULL)
    {
    /* newly queued job already exists */

    if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING)
      {
      /* FAILURE - job exists and is running */

      log_err(errno, __func__, (char *)"cannot queue new job, job exists and is running");

      req_reject(PBSE_JOBEXIST, 0, preq, NULL, "job is running");

      return;
      }

    /* if checkpointed, then keep old and skip rest of process */

    if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
      {
      IsCheckpoint = 1;
      }  /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */
    else
      {
      /* reject the job. It is already working here. */
      sprintf(log_buffer, "Job already exists. State: %d substate: %d", pj->ji_qs.ji_state, pj->ji_qs.ji_substate);
      log_err(-1, __func__, log_buffer);
      sprintf(log_buffer, "Job %s already on mom", pj->ji_qs.ji_jobid);
      req_reject(PBSE_JOBEXIST, 0, preq, NULL, log_buffer);
      return;
      }
    }  /* END if (pj != NULL) */
  else
    {
    /* if not already here, allocate job struct */

    if ((pj = job_alloc()) == NULL)
      {
      /* FAILURE */

      req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, "cannot allocate new job structure");

      return;
      }
    }    /* END else (pj != NULL) */

  if (IsCheckpoint == 0)
    {
    strcpy(pj->ji_qs.ji_jobid,jid);

    strcpy(pj->ji_qs.ji_fileprefix,basename);

    pj->ji_modified       = 1;

    pj->ji_qs.ji_svrflags = created_here;

    pj->ji_qs.ji_un_type  = JOB_UNION_TYPE_NEW;

    /* changing the union type overwrites the euid for the job, and if
     * ji_grpcache is set this potentially allows jobs to run as root. Unsetting
     * ji_grpcache fixes this problem --dbeer */
    if (pj->ji_grpcache != NULL)
      {
      free(pj->ji_grpcache);
      pj->ji_grpcache = NULL;
      }
    }

  /* decode attributes from request into job structure */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (IsCheckpoint == 1)
      {
      /* for a kept checkpointed job only the checkpoint name and the
       * variable list are refreshed; every other attribute is skipped */

      if (strcmp(psatl->al_name,ATTR_checkpoint_name) &&
          strcmp(psatl->al_name,ATTR_v))
        {
        psatl = (svrattrl *)GET_NEXT(psatl->al_link);

        continue;
        }
      }

    /* identify the pbs_attribute by name */

    index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST);

    if (index < 0)
      {
      /* FAILURE */

      /* didn`t recognize the name */

      mom_job_purge(pj);   /* CRI - 12/20/2004 */

      reply_badattr(PBSE_NOATTR, 1, psatl, preq);

      return;
      }

    pdef = &job_attr_def[index];

    /* Is pbs_attribute not writeable by manager or by a server? */

    if ((pdef->at_flags & resc_access_perm) == 0)
      {
      /* FAILURE */

      mom_job_purge(pj);

      reply_badattr(PBSE_ATTRRO, 1, psatl, preq);

      return;
      }

    /* decode pbs_attribute */

    if (!strcmp(psatl->al_name,ATTR_v))
      {
      rc = decode_arst_merge(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value);
      }
    else
      {
      rc = pdef->at_decode(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value,
             resc_access_perm);
      }

    if (rc != 0)
      {
      /* FAILURE */

      /* all errors are fatal for MOM */

      mom_job_purge(pj);

      reply_badattr(rc, 1, psatl, preq);

      return;
      }

    if (psatl->al_op == DFLT)
      {
      if (psatl->al_resc)
        {
        resource     *presc;
        resource_def *prdef;

        prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size);

        if (prdef == NULL)
          {
          mom_job_purge(pj);

          /* rc is known to be 0 at this point, so report the real
           * reason (unknown resource) rather than a "no error" code */

          reply_badattr(PBSE_UNKRESC, 1, psatl, preq);

          return;
          }

        presc = find_resc_entry(&pj->ji_wattr[index],prdef);

        if (presc != NULL)
          presc->rs_value.at_flags |= ATR_VFLAG_DEFLT;
        }
      else
        {
        pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT;
        }
      }    /* END if (psatl->al_op == DFLT) */

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }      /* END while (psatl != NULL) */

  if (IsCheckpoint == 1)
    {
    pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

    if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0)
      {
      /* move the existing job from its current list to the new-jobs list */

      remove_from_job_list(pj);

      append_link(&svr_newjobs,&pj->ji_alljobs,pj);

      /* Per Eric R., req_mvjobfile was giving error in open_std_file,
         showed up as fishy error message */

      if (pj->ji_grpcache != NULL)
        {
        free(pj->ji_grpcache);
        pj->ji_grpcache = NULL;
        }

      pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;
      pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;
      pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock,FALSE);
      pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;
      }
    else
      {
      close_conn(sock, FALSE);
      }

    /* SUCCESS */

    return;
    }

  /* set remaining job structure elements */
  pj->ji_qs.ji_state =    JOB_STATE_TRANSIT;
  pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

  pj->ji_wattr[JOB_ATR_mtime].at_val.at_long = (long)time_now;
  pj->ji_wattr[JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET;

  if (pj->ji_grpcache != NULL)
    {
    free(pj->ji_grpcache);
    pj->ji_grpcache = NULL;
    }

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;
  pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;
  pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock,FALSE);
  pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

  /* acknowledge the request with the job id */
  if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0)
    {
    /* reply failed, purge the job and close the connection */
    /* call mom_job_purge first so that double-frees don't happen
     * when the on_close function is called */
    mom_job_purge(pj);

    close_conn(sock, FALSE);

    return;
    }

  /* link job into server's new jobs list request  */
  append_link(&svr_newjobs, &pj->ji_alljobs, pj);

  return;
  }  /* END mom_req_quejob() */
コード例 #5
0
ファイル: prolog.c プロジェクト: ansonl/torque
/*
 * setup_pelog_environment - export job details into the environment used
 * by a prologue/epilogue script.
 *
 * Variables set through put_env_var(), each only when its source is present:
 *   PBS_RESOURCE_NODES, PBS_NUM_NODES, PBS_NUM_PPN, PBS_NP  ("nodes" resource)
 *   PBS_RESOURCE_GRES                                       ("gres" resource)
 *   PBS_CPUCLOCK, PBS_SCHED_HINT, PBS_O_WORKDIR, BEOWULF_JOB_MAP
 *                                                   (job variable list)
 *   TMPDIR, PBS_NODENUM, PBS_MSHOST, PBS_NODEFILE
 *   MOAB_* variables, only for PE_PROLOG and PE_EPILOG
 *
 * pjob  - the job the script is being run for
 * which - which script is being run (one of the PE_* values)
 */
void setup_pelog_environment(
    
  job *pjob,
  int  which)

  {
  char      buf[MAXPATHLEN + 1024];
  resource *r;
  /*
   * Pass Resource_List.nodes request in environment
   * to allow pro/epi-logue setup/teardown of system
   * settings.  --pw, 2 Jan 02
   * Fixed to use putenv for sysV compatibility.
   *  --troy, 11 jun 03
   *
   */

  r = find_resc_entry(
        &pjob->ji_wattr[JOB_ATR_resource],
        find_resc_def(svr_resc_def, (char *)"nodes", svr_resc_size));

  if (r != NULL)
    {
    const char *ppn_str = "ppn=";
    int         num_nodes = 1;
    int         num_ppn = 1;

    /* PBS_RESOURCE_NODES */
    put_env_var("PBS_RESOURCE_NODES", r->rs_value.at_val.at_str);

    /* PBS_NUM_NODES - leading integer of the nodes request */
    num_nodes = strtol(r->rs_value.at_val.at_str, NULL, 10);

    /* 
     * InitUserEnv() also calculates num_nodes and num_ppn the same way
     */
    if (num_nodes != 0)
      {
      char *tmp;
      char *other_reqs;

      /* get the ppn from the first "ppn=" in the request */
      if ((tmp = strstr(r->rs_value.at_val.at_str,ppn_str)) != NULL)
        {
        tmp += strlen(ppn_str);

        num_ppn = strtol(tmp, NULL, 10);
        }

      other_reqs = r->rs_value.at_val.at_str;

      /* add the leading integer of every '+'-separated sub-request */
      while ((other_reqs = strchr(other_reqs, '+')) != NULL)
        {
        other_reqs += 1;
        num_nodes += strtol(other_reqs, &other_reqs, 10);
        }
      }

    sprintf(buf, "%d", num_nodes);
    put_env_var("PBS_NUM_NODES", buf);

    /* PBS_NUM_PPN */
    sprintf(buf, "%d", num_ppn);
    put_env_var("PBS_NUM_PPN", buf);

    /* PBS_NP */
    sprintf(buf, "%d", pjob->ji_numvnod);
    put_env_var("PBS_NP", buf);
    }  /* END if (r != NULL) */

  r = find_resc_entry(
        &pjob->ji_wattr[JOB_ATR_resource],
        find_resc_def(svr_resc_def, (char *)"gres", svr_resc_size));

  if (r != NULL)
    {
    put_env_var("PBS_RESOURCE_GRES", r->rs_value.at_val.at_str);
    }

  /* PBS_CPUCLOCK - export only the part after "NAME=" */
  char *cpu_clock = arst_string("PBS_CPUCLOCK",&pjob->ji_wattr[JOB_ATR_variables]);
  if (cpu_clock != NULL)
    {
    cpu_clock = strchr(cpu_clock,'=');
    if (cpu_clock != NULL)
      {
      cpu_clock++;
      put_env_var("PBS_CPUCLOCK",cpu_clock);
      }
    }

  if (TTmpDirName(pjob, buf, sizeof(buf)))
    {
    put_env_var("TMPDIR", buf);
    }

  /* Set PBS_SCHED_HINT */
  char *envname = (char *)"PBS_SCHED_HINT";
  char *envval;

  if ((envval = get_job_envvar(pjob, envname)) != NULL)
    {
    put_env_var("PBS_SCHED_HINT", envval);
    }

  /* Set PBS_NODENUM */

  sprintf(buf, "%d", pjob->ji_nodeid);
  put_env_var("PBS_NODENUM", buf);

  /* Set PBS_MSHOST */
  /* NOTE(review): assumes ji_vnods[0] and its vn_host are populated by the
   * time this runs - confirm against callers */
  put_env_var("PBS_MSHOST", pjob->ji_vnods[0].vn_host->hn_host);

  /* Set PBS_NODEFILE */
  if (pjob->ji_flags & MOM_HAS_NODEFILE)
    {
    sprintf(buf, "%s/%s",
      path_aux,
      pjob->ji_qs.ji_jobid);
    put_env_var("PBS_NODEFILE", buf);
    }

  /* Set PBS_O_WORKDIR */
  char *workdir_val = get_job_envvar(pjob,"PBS_O_WORKDIR");
  if (workdir_val != NULL)
    {
    put_env_var("PBS_O_WORKDIR", workdir_val);
    }

  /* SET BEOWULF_JOB_MAP */
  struct array_strings *vstrs;

  vstrs = pjob->ji_wattr[JOB_ATR_variables].at_val.at_arst;

  /* guard against a job whose variable list pointer is NULL before
   * walking it directly */
  if (vstrs != NULL)
    {
    int j;

    for (j = 0;j < vstrs->as_usedptr;++j)
      {
      if (!strncmp(
            vstrs->as_string[j],
            "BEOWULF_JOB_MAP=",
            strlen("BEOWULF_JOB_MAP=")))
        {
        /* only the first matching entry is exported */
        char *val = strchr(vstrs->as_string[j], '=');

        if (val != NULL)
          put_env_var("BEOWULF_JOB_MAP", val+1);

        break;
        }
      }
    }

  /* Set some Moab env variables if they exist */
  if ((which == PE_PROLOG) ||
      (which == PE_EPILOG))
    {
    char *tmp_val;
    static char      *moabenvs[] = {
        (char *)"MOAB_NODELIST",
        (char *)"MOAB_JOBID",
        (char *)"MOAB_JOBNAME",
        (char *)"MOAB_USER",
        (char *)"MOAB_GROUP",
        (char *)"MOAB_CLASS",
        (char *)"MOAB_TASKMAP",
        (char *)"MOAB_QOS",
        (char *)"MOAB_PARTITION",
        (char *)"MOAB_PROCCOUNT",
        (char *)"MOAB_NODECOUNT",
        (char *)"MOAB_MACHINE",
        (char *)"MOAB_JOBARRAYINDEX",
        (char *)"MOAB_JOBARRAYRANGE"
        };
    /* derive the count from the table so the two cannot drift apart */
    int   moabenvcnt = (int)(sizeof(moabenvs) / sizeof(moabenvs[0]));
    
    for (int aindex=0; aindex < moabenvcnt; aindex++)
      {
      tmp_val = get_job_envvar(pjob,moabenvs[aindex]);
      if (tmp_val != NULL)
        {
        put_env_var(moabenvs[aindex], tmp_val);
        }
      }
    }
  } /* END setup_pelog_environment() */
コード例 #6
0
ファイル: resc_def_all.c プロジェクト: msbritt/torque
/*
 * set_mppnodect - derive the "mppnodect" resource from "mppwidth" and
 * "mppnppn".
 *
 * When both source resources are present, mppnodect is width divided by
 * nppn rounded up (or just width when nppn is 1 or less). If width is
 * non-zero and smaller than nppn, the stored mppnppn value is first lowered
 * to width. When either source resource is missing, mppnodect is set to -1.
 *
 * Returns PBSE_NONE on success, PBSE_SYSTEM when the "mppnodect" definition
 * is unknown or its entry cannot be created.
 */
int set_mppnodect(

  resource      * UNUSED(res),
  pbs_attribute *attr,
  int             UNUSED(op))

  {
  resource_def *rdef;
  resource     *width_ent = NULL;
  resource     *nppn_ent = NULL;
  resource     *nodect_ent;
  int           width = 0;
  int           nppn = 0;
  int           nodect;

  /* look up the current width, if the job has one */

  rdef = find_resc_def(svr_resc_def,"mppwidth",svr_resc_size);

  if (rdef != NULL)
    {
    width_ent = find_resc_entry(attr,rdef);
    }

  if (width_ent != NULL)
    {
    width = width_ent->rs_value.at_val.at_long;
    }

  /* look up the current processors-per-node, if the job has one */

  rdef = find_resc_def(svr_resc_def,"mppnppn",svr_resc_size);

  if (rdef != NULL)
    {
    nppn_ent = find_resc_entry(attr,rdef);
    }

  if (nppn_ent != NULL)
    {
    nppn = nppn_ent->rs_value.at_val.at_long;

    /* a job narrower than one node cannot use more than width per node */

    if ((width != 0) && (width < nppn))
      {
      nppn = width;

      nppn_ent->rs_value.at_val.at_long = nppn;
      nppn_ent->rs_value.at_flags |= ATR_VFLAG_SET;
      }
    }

  /* estimate the node count: ceiling of width / nppn */

  nodect = (nppn > 1) ? ((width + nppn - 1) / nppn) : width;

  /* find or create the "mppnodect" entry */

  rdef = find_resc_def(svr_resc_def,"mppnodect",svr_resc_size);

  if (rdef == NULL)
    {
    return(PBSE_SYSTEM);
    }

  nodect_ent = find_resc_entry(attr,rdef);

  if (nodect_ent == NULL)
    {
    nodect_ent = add_resource_entry(attr,rdef);

    if (nodect_ent == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }

  /* store the estimate only when both inputs were available */

  if ((width_ent != NULL) && (nppn_ent != NULL))
    {
    nodect_ent->rs_value.at_val.at_long = nodect;
    }
  else
    {
    nodect_ent->rs_value.at_val.at_long = -1;
    }

  nodect_ent->rs_value.at_flags |= ATR_VFLAG_SET;

  return(PBSE_NONE);
  } /* END set_mppnodect() */
コード例 #7
0
ファイル: resc_def_all.c プロジェクト: msbritt/torque
/*
 * set_node_ct - refresh the resources derived from the "nodes" resource.
 *
 * Three entries in pattr are updated from the "nodes" string:
 *   "nodect"    - set to ctnodes() of the string
 *   "neednodes" - any previous value is freed, then the string is decoded
 *                 into it
 *   "procct"    - set to count_proc() of the string, plus the value of the
 *                 "procs" resource when one is present
 *
 * pnodesp - the "nodes" resource entry
 * pattr   - the attribute holding the resource list
 * actmode - action mode; ATR_ACTION_RECOV is a no-op
 *
 * Returns 0 on success (including the ATR_ACTION_RECOV no-op), PBSE_SYSTEM
 * when a definition is missing or an entry cannot be created.
 */
int set_node_ct(

  resource      *pnodesp,  /* I */
  pbs_attribute *pattr,    /* I */
  int            actmode)  /* I */

  {
  resource_def *def;
  resource     *entry;
  resource_def *procs_def;
  resource     *procs_entry;

  if (actmode == ATR_ACTION_RECOV)
    {
    return(0);
    }

  /* ---- "nodect": number of nodes named in "nodes" ---- */

  def = find_resc_def(svr_resc_def, "nodect", svr_resc_size);

  if (def == NULL)
    {
    return(PBSE_SYSTEM);
    }

  entry = find_resc_entry(pattr, def);

  if ((entry == NULL) &&
      ((entry = add_resource_entry(pattr, def)) == NULL))
    {
    return(PBSE_SYSTEM);
    }

  entry->rs_value.at_val.at_long = ctnodes(pnodesp->rs_value.at_val.at_str);
  entry->rs_value.at_flags |= ATR_VFLAG_SET;

  /* ---- "neednodes": copy of "nodes", may be altered by scheduler ---- */

  def = find_resc_def(svr_resc_def, "neednodes", svr_resc_size);

  if (def == NULL)
    {
    return(PBSE_SYSTEM);
    }

  entry = find_resc_entry(pattr, def);

  if (entry != NULL)
    {
    /* release the old value before decoding the new one over it */

    def->rs_free(&entry->rs_value);
    }
  else if ((entry = add_resource_entry(pattr, def)) == NULL)
    {
    return(PBSE_SYSTEM);
    }

  def->rs_decode(&entry->rs_value, NULL, NULL, pnodesp->rs_value.at_val.at_str, ATR_DFLAG_ACCESS);

  entry->rs_value.at_flags |= ATR_VFLAG_SET;

  /* ---- "procct": processors in "nodes" plus "procs" ---- */

  def = find_resc_def(svr_resc_def, "procct", svr_resc_size);

  if (def == NULL)
    {
    return(PBSE_SYSTEM);
    }

  entry = find_resc_entry(pattr, def);

  if ((entry == NULL) &&
      ((entry = add_resource_entry(pattr, def)) == NULL))
    {
    return(PBSE_SYSTEM);
    }

  procs_def = find_resc_def(svr_resc_def, "procs", svr_resc_size);

  if (procs_def == NULL)
    {
    return(PBSE_SYSTEM);
    }

  procs_entry = find_resc_entry(pattr, procs_def);

  /* start from "procs" when it exists, otherwise from zero */

  if (procs_entry != NULL)
    {
    entry->rs_value.at_val.at_long = procs_entry->rs_value.at_val.at_long;
    }
  else
    {
    entry->rs_value.at_val.at_long = 0;
    }

  entry->rs_value.at_val.at_long += count_proc(pnodesp->rs_value.at_val.at_str);

  entry->rs_value.at_flags |= ATR_VFLAG_SET;

  return(0);
  }  /* END set_node_ct() */
コード例 #8
0
ファイル: prolog.c プロジェクト: j0hnf/torque
int run_pelog(

  int   which,      /* I (one of PE_*) */
  char *specpelog,  /* I - script path */
  job  *pjob,       /* I - associated job */
  int   pe_io_type, /* I - io type */
  int   deletejob)  /* I - called before a job being deleted (purge -p) */

  {
  struct sigaction  act;
  struct sigaction  oldact;
  char             *arg[12];
  int               fds1 = 0;
  int               fds2 = 0;
  int               fd_input;
  char              resc_list[2048];
  char              resc_used[2048];

  struct stat       sbuf;
  char              sid[20];
  char              exit_stat[11];
  int               waitst;
  int               isjoined;  /* boolean */
  char              buf[MAXPATHLEN + 1024];
  char              pelog[MAXPATHLEN + 1024];

  uid_t             real_uid;
  gid_t            *real_gids = NULL;
  gid_t             real_gid;
  int               num_gids;

  int               jobtypespecified = 0;

  resource         *r;

  char             *EmptyString = (char *)"";

  int               LastArg;
  int               aindex;

  int               rc;

  char             *ptr;

  int               moabenvcnt = 14;  /* # of entries in moabenvs */
  static char      *moabenvs[] = {
      (char *)"MOAB_NODELIST",
      (char *)"MOAB_JOBID",
      (char *)"MOAB_JOBNAME",
      (char *)"MOAB_USER",
      (char *)"MOAB_GROUP",
      (char *)"MOAB_CLASS",
      (char *)"MOAB_TASKMAP",
      (char *)"MOAB_QOS",
      (char *)"MOAB_PARTITION",
      (char *)"MOAB_PROCCOUNT",
      (char *)"MOAB_NODECOUNT",
      (char *)"MOAB_MACHINE",
      (char *)"MOAB_JOBARRAYINDEX",
      (char *)"MOAB_JOBARRAYRANGE"
      };

  if ((pjob == NULL) || (specpelog == NULL) || (specpelog[0] == '\0'))
    {
    return(0);
    }

  ptr = pjob->ji_wattr[JOB_ATR_jobtype].at_val.at_str;

  if (ptr != NULL)
    {
    jobtypespecified = 1;

    snprintf(pelog,sizeof(pelog),"%s.%s",
      specpelog,
      ptr);
    }
  else
    {
    snprintf(pelog, sizeof(pelog), "%s", specpelog);
    }
    
  real_uid = getuid();
  real_gid = getgid();
  if ((num_gids = getgroups(0, real_gids)) < 0)
    {
    log_err(errno, __func__, (char *)"getgroups failed\n");
    
    return(-1);
    }

  /* to support root squashing, become the user before performing file checks */
  if ((which == PE_PROLOGUSER) || 
      (which == PE_EPILOGUSER) || 
      (which == PE_PROLOGUSERJOB) || 
      (which == PE_EPILOGUSERJOB))
    {

    real_gids = (gid_t *)calloc(num_gids, sizeof(gid_t));
    
    if (real_gids == NULL)
      {
      log_err(ENOMEM, __func__, (char *)"Cannot allocate memory! FAILURE\n");
      
      return(-1);
      }
    
    if (getgroups(num_gids,real_gids) < 0)
      {
      log_err(errno, __func__, (char *)"getgroups failed\n");
      free(real_gids);
      
      return(-1);
      }
    
    /* pjob->ji_grpcache will not be set if using LDAP and LDAP not set */
    /* It is possible that ji_grpcache failed to allocate as well. 
       Make sure ji_grpcache is not NULL */
    if (pjob->ji_grpcache != NULL)
      {
      if (setgroups(
            pjob->ji_grpcache->gc_ngroup,
            (gid_t *)pjob->ji_grpcache->gc_groups) != 0)
        {
        snprintf(log_buffer,sizeof(log_buffer),
          "setgroups() for UID = %lu failed: %s\n",
          (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
          strerror(errno));
      
        log_err(errno, __func__, log_buffer);
      
        undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
        free(real_gids);
      
        return(-1);
        }
      }
    else
      {
      sprintf(log_buffer, "pjob->ji_grpcache is null. check_pwd likely failed.");
      log_err(-1, __func__, log_buffer);
      undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
      free(real_gids);
      return(-1);
      }
    
    if (setegid(pjob->ji_qs.ji_un.ji_momt.ji_exgid) != 0)
      {
      snprintf(log_buffer,sizeof(log_buffer),
        "setegid(%lu) for UID = %lu failed: %s\n",
        (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exgid,
        (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
        strerror(errno));
      
      log_err(errno, __func__, log_buffer);
      
      undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
      free(real_gids);
      
      return(-1);
      }
    
    if (setuid_ext(pjob->ji_qs.ji_un.ji_momt.ji_exuid, TRUE) != 0)
      {
      snprintf(log_buffer,sizeof(log_buffer),
        "seteuid(%lu) failed: %s\n",
        (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
        strerror(errno));
      
      log_err(errno, __func__, log_buffer);
      
      undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
      free(real_gids);

      return(-1);
      }
    }

  rc = stat(pelog,&sbuf);

  if ((rc == -1) && (jobtypespecified == 1))
    {
    snprintf(pelog, sizeof(pelog), "%s", specpelog);

    rc = stat(pelog,&sbuf);
    }

  if (rc == -1)
    {
    if (errno == ENOENT || errno == EBADF)
      {
      /* epilog/prolog script does not exist */

      if (LOGLEVEL >= 5)
        {
        static char tmpBuf[1024];

        sprintf(log_buffer, "%s script '%s' for job %s does not exist (cwd: %s,pid: %d)",
          PPEType[which],
          (pelog[0] != '\0') ? pelog : "NULL",
          pjob->ji_qs.ji_jobid,
          getcwd(tmpBuf, sizeof(tmpBuf)),
          getpid());

        log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
        }

#ifdef ENABLE_CSA
      if ((which == PE_EPILOGUSER) && (!strcmp(pelog, path_epiloguser)))
        {
        /*
          * Add a workload management end record
        */
        if (LOGLEVEL >= 8)
          {
          sprintf(log_buffer, "%s calling add_wkm_end from run_pelog() - no user epilog",
            pjob->ji_qs.ji_jobid);

          log_err(-1, __func__, log_buffer);
          }

        add_wkm_end(pjob->ji_wattr[JOB_ATR_pagg_id].at_val.at_ll,
            pjob->ji_qs.ji_un.ji_momt.ji_exitstat, pjob->ji_qs.ji_jobid);
        }

#endif /* ENABLE_CSA */

      undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
      free(real_gids);

      return(0);
      }
      
    undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
    free(real_gids);

    return(pelog_err(pjob,pelog,errno,(char *)"cannot stat"));
    }

  if (LOGLEVEL >= 5)
    {
    sprintf(log_buffer,"running %s script '%s' for job %s",
      PPEType[which],
      (pelog[0] != '\0') ? pelog : "NULL",
      pjob->ji_qs.ji_jobid);

    log_ext(-1, __func__, log_buffer, LOG_DEBUG);  /* not actually an error--but informational */
    }

  /* script must be owned by root, be regular file, read and execute by user *
   * and not writeable by group or other */

  if (reduceprologchecks == TRUE)
    {
    if ((!S_ISREG(sbuf.st_mode)) ||
        (!(sbuf.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))))
      {
      undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
      free(real_gids);
      return(pelog_err(pjob,pelog,-1, (char *)"permission Error"));
      }
    }
  else
    {
    if (which == PE_PROLOGUSERJOB || which == PE_EPILOGUSERJOB)
      {
      if ((sbuf.st_uid != pjob->ji_qs.ji_un.ji_momt.ji_exuid) || 
          (!S_ISREG(sbuf.st_mode)) ||
          ((sbuf.st_mode & (S_IRUSR | S_IXUSR)) != (S_IRUSR | S_IXUSR)) ||
          (sbuf.st_mode & (S_IWGRP | S_IWOTH)))
        {
        undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
        free(real_gids);
        return(pelog_err(pjob,pelog,-1, (char *)"permission Error"));
        }
      }
    else if ((sbuf.st_uid != 0) ||
        (!S_ISREG(sbuf.st_mode)) ||
        ((sbuf.st_mode & (S_IRUSR | S_IXUSR)) != (S_IRUSR | S_IXUSR)) ||\
        (sbuf.st_mode & (S_IWGRP | S_IWOTH)))
      {
      undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
      free(real_gids);
      return(pelog_err(pjob,pelog,-1, (char *)"permission Error"));
      }
    
    if ((which == PE_PROLOGUSER) || (which == PE_EPILOGUSER))
      {
      /* script must also be read and execute by other */
      
      if ((sbuf.st_mode & (S_IROTH | S_IXOTH)) != (S_IROTH | S_IXOTH))
        {
        undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
        free(real_gids);
        return(pelog_err(pjob, pelog, -1,  (char *)"permission Error"));
        }
      }
    } /* END !reduceprologchecks */

  fd_input = pe_input(pjob->ji_qs.ji_jobid);

  if (fd_input < 0)
    {
    undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
    free(real_gids);
    return(pelog_err(pjob, pelog, -2,  (char *)"no pro/epilogue input file"));
    }

  run_exit = 0;

  child = fork();

  if (child > 0)
    {
    int KillSent = FALSE;

    /* parent - watch for prolog/epilog to complete */

    close(fd_input);

    /* switch back to root if necessary */
    undo_set_euid_egid(which,real_uid,real_gid,num_gids,real_gids,__func__);
    free(real_gids);

    act.sa_handler = pelogalm;

    sigemptyset(&act.sa_mask);

    act.sa_flags = 0;

    sigaction(SIGALRM, &act, &oldact);

    /* it would be nice if the harvest routine could block for 5 seconds,
       and if the prolog is not complete in that time, mark job as prolog
       pending, append prolog child, and continue */

    /* main loop should attempt to harvest prolog in non-blocking mode.
       If unsuccessful after timeout, job should be terminated, and failure
       reported.  If successful, mom should unset prolog pending, and
       continue with job start sequence.  Mom should report job as running
       while prologpending flag is set.  (NOTE:  must track per job prolog
       start time)
    */

    alarm(pe_alarm_time);

    while (waitpid(child, &waitst, 0) < 0)
      {
      if (errno != EINTR)
        {
        /* exit loop. non-alarm based failure occurred */

        run_exit = -3;

        MOMPrologFailureCount++;

        break;
        }

      if (run_exit == -4)
        {
        if (KillSent == FALSE)
          {
          MOMPrologTimeoutCount++;

          /* timeout occurred */

          KillSent = TRUE;

          /* NOTE:  prolog/epilog may be locked in KERNEL space and unkillable */

          alarm(5);
          }
        else
          {
          /* cannot kill prolog/epilog, give up */

          run_exit = -5;

          break;
          }
        }
      }    /* END while (wait(&waitst) < 0) */

    /* epilog/prolog child completed */
#ifdef ENABLE_CSA
    if ((which == PE_EPILOGUSER) && (!strcmp(pelog, path_epiloguser)))
      {
      /*
       * Add a workload management end record
      */
      if (LOGLEVEL >= 8)
        {
        sprintf(log_buffer, "%s calling add_wkm_end from run_pelog() - after user epilog",
                pjob->ji_qs.ji_jobid);

        log_err(-1, __func__, log_buffer);
        }

      add_wkm_end(pjob->ji_wattr[JOB_ATR_pagg_id].at_val.at_ll,
          pjob->ji_qs.ji_un.ji_momt.ji_exitstat, pjob->ji_qs.ji_jobid);
      }

#endif /* ENABLE_CSA */

    alarm(0);

    /* restore the previous handler */

    sigaction(SIGALRM, &oldact, 0);

    if (run_exit == 0)
      {
      if (WIFEXITED(waitst))
        {
        run_exit = WEXITSTATUS(waitst);
        }
      }
    }
  else
    {
    /* child - run script */

    log_close(0);

    if (lockfds >= 0)
      {
      close(lockfds);

      lockfds = -1;
      }

    net_close(-1);

    if (fd_input != 0)
      {
      close(0);

      if (dup(fd_input) == -1) {}

      close(fd_input);
      }

    if (pe_io_type == PE_IO_TYPE_NULL)
      {
      /* no output, force to /dev/null */

      fds1 = open("/dev/null", O_WRONLY, 0600);
      fds2 = open("/dev/null", O_WRONLY, 0600);
      }
    else if (pe_io_type == PE_IO_TYPE_STD)
      {
      /* open job standard out/error */

      /*
       * We need to know if files are joined or not.
       * If they are then open the correct file and duplicate it to the other
      */

      isjoined = is_joined(pjob);

      switch (isjoined)
        {
        case -1:

          fds2 = open_std_file(pjob, StdErr, O_WRONLY | O_APPEND,
                               pjob->ji_qs.ji_un.ji_momt.ji_exgid);

          fds1 = (fds2 < 0)?-1:dup(fds2);

          break;

        case 1:

          fds1 = open_std_file(pjob, StdOut, O_WRONLY | O_APPEND,
                               pjob->ji_qs.ji_un.ji_momt.ji_exgid);

          fds2 = (fds1 < 0)?-1:dup(fds1);

          break;

        default:

          fds1 = open_std_file(pjob, StdOut, O_WRONLY | O_APPEND,
                               pjob->ji_qs.ji_un.ji_momt.ji_exgid);

          fds2 = open_std_file(pjob, StdErr, O_WRONLY | O_APPEND,
                               pjob->ji_qs.ji_un.ji_momt.ji_exgid);
          break;
        }
      }

    if (!deletejob)
      if ((fds1 < 0) ||
          (fds2 < 0))
        {
        if (fds1 >= 0)
          close(fds1);
        if (fds2 >= 0)
          close(fds2);

        exit(-1);
        }

    if (pe_io_type != PE_IO_TYPE_ASIS)
      {
      /* If PE_IO_TYPE_ASIS, leave as is, already open to job */

      if (fds1 != 1)
        {
        close(1);

        if (dup(fds1) >= 0)
          {
          close(fds1);
          }
        }

      if (fds2 != 2)
        {
        close(2);

        if (dup(fds2) >= 0)
          {
          close(fds2);
          }
        }
      }

    if ((which == PE_PROLOGUSER) || 
        (which == PE_EPILOGUSER) || 
        (which == PE_PROLOGUSERJOB) || 
        (which == PE_EPILOGUSERJOB))
      {
      if (chdir(pjob->ji_grpcache->gc_homedir) != 0)
        {
        /* warn only, no failure */

        sprintf(log_buffer,
          "PBS: chdir to %s failed: %s (running user %s in current directory)",
          pjob->ji_grpcache->gc_homedir,
          strerror(errno),
          which == PE_PROLOGUSER ? "prologue" : "epilogue");

        if (write_ac_socket(2, log_buffer, strlen(log_buffer)) == -1) {}

        fsync(2);
        }
      }

    /* for both prolog and epilog */

    if (DEBUGMODE == 1)
      {
      fprintf(stderr, "PELOGINFO:  script:'%s'  jobid:'%s'  euser:'******'  egroup:'%s'  jobname:'%s' SSID:'%ld'  RESC:'%s'\n",
              pelog,
              pjob->ji_qs.ji_jobid,
              pjob->ji_wattr[JOB_ATR_euser].at_val.at_str,
              pjob->ji_wattr[JOB_ATR_egroup].at_val.at_str,
              pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str,
              pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long,
              resc_to_string(pjob, JOB_ATR_resource, resc_list, sizeof(resc_list)));
      }

    arg[0] = pelog;

    arg[1] = pjob->ji_qs.ji_jobid;
    arg[2] = pjob->ji_wattr[JOB_ATR_euser].at_val.at_str;
    arg[3] = pjob->ji_wattr[JOB_ATR_egroup].at_val.at_str;
    arg[4] = pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str;

    /* NOTE:  inside child */

    if ((which == PE_EPILOG) || 
        (which == PE_EPILOGUSER) || 
        (which == PE_EPILOGUSERJOB))
      {
      /* for epilog only */

      sprintf(sid, "%ld",
              pjob->ji_wattr[JOB_ATR_session_id].at_val.at_long);
      sprintf(exit_stat,"%d",
              pjob->ji_qs.ji_un.ji_momt.ji_exitstat);

      arg[5] = sid;
      arg[6] = resc_to_string(pjob, JOB_ATR_resource, resc_list, sizeof(resc_list));
      arg[7] = resc_to_string(pjob, JOB_ATR_resc_used, resc_used, sizeof(resc_used));
      arg[8] = pjob->ji_wattr[JOB_ATR_in_queue].at_val.at_str;
      arg[9] = pjob->ji_wattr[JOB_ATR_account].at_val.at_str;
      arg[10] = exit_stat;
      arg[11] = NULL;

      LastArg = 11;
      }
    else
      {
      /* prolog */

      arg[5] = resc_to_string(pjob, JOB_ATR_resource, resc_list, sizeof(resc_list));
      arg[6] = pjob->ji_wattr[JOB_ATR_in_queue].at_val.at_str;
      arg[7] = pjob->ji_wattr[JOB_ATR_account].at_val.at_str;
      arg[8] = NULL;

      LastArg = 8;
      }

    for (aindex = 0;aindex < LastArg;aindex++)
      {
      if (arg[aindex] == NULL)
        arg[aindex] = EmptyString;
      }  /* END for (aindex) */

    /*
     * Pass Resource_List.nodes request in environment
     * to allow pro/epi-logue setup/teardown of system
     * settings.  --pw, 2 Jan 02
     * Fixed to use putenv for sysV compatibility.
     *  --troy, 11 jun 03
     *
     */

    r = find_resc_entry(
          &pjob->ji_wattr[JOB_ATR_resource],
          find_resc_def(svr_resc_def, (char *)"nodes", svr_resc_size));

    if (r != NULL)
      {
      /* setenv("PBS_RESOURCE_NODES",r->rs_value.at_val.at_str,1); */

      const char *ppn_str = "ppn=";
      int num_nodes = 1;
      int num_ppn = 1;

      /* PBS_RESOURCE_NODES */
      put_env_var("PBS_RESOURCE_NODES", r->rs_value.at_val.at_str);

      /* PBS_NUM_NODES */
      num_nodes = strtol(r->rs_value.at_val.at_str, NULL, 10);

      /* 
       * InitUserEnv() also calculates num_nodes and num_ppn the same way
       */
      if (num_nodes != 0)
        {
        char *tmp;
        char *other_reqs;

        /* get the ppn */
        if ((tmp = strstr(r->rs_value.at_val.at_str,ppn_str)) != NULL)
          {
          tmp += strlen(ppn_str);

          num_ppn = strtol(tmp, NULL, 10);
          }

        other_reqs = r->rs_value.at_val.at_str;

        while ((other_reqs = strchr(other_reqs, '+')) != NULL)
          {
          other_reqs += 1;
          num_nodes += strtol(other_reqs, &other_reqs, 10);
          }
        }

      sprintf(buf, "%d", num_nodes);
      put_env_var("PBS_NUM_NODES", buf);

      /* PBS_NUM_PPN */
      sprintf(buf, "%d", num_ppn);
      put_env_var("PBS_NUM_PPN", buf);

      /* PBS_NP */
      sprintf(buf, "%d", pjob->ji_numvnod);
      put_env_var("PBS_NP", buf);
      }  /* END if (r != NULL) */

    r = find_resc_entry(
          &pjob->ji_wattr[JOB_ATR_resource],
          find_resc_def(svr_resc_def, (char *)"gres", svr_resc_size));

    if (r != NULL)
      {
      /* setenv("PBS_RESOURCE_NODES",r->rs_value.at_val.at_str,1); */
      put_env_var("PBS_RESOURCE_GRES", r->rs_value.at_val.at_str);
      }

    if (TTmpDirName(pjob, buf, sizeof(buf)))
      {
      put_env_var("TMPDIR", buf);
      }

    /* Set PBS_SCHED_HINT */

    {
    char *envname = (char *)"PBS_SCHED_HINT";
    char *envval;

    if ((envval = get_job_envvar(pjob, envname)) != NULL)
      {
      put_env_var("PBS_SCHED_HINT", envval);
      }
    }

    /* Set PBS_NODENUM */

    sprintf(buf, "%d",
      pjob->ji_nodeid);
    put_env_var("PBS_NODENUM", buf);

    /* Set PBS_MSHOST */

    put_env_var("PBS_MSHOST", pjob->ji_vnods[0].vn_host->hn_host);

    /* Set PBS_NODEFILE */

    if (pjob->ji_flags & MOM_HAS_NODEFILE)
      {
      sprintf(buf, "%s/%s",
        path_aux,
        pjob->ji_qs.ji_jobid);
      put_env_var("PBS_NODEFILE", buf);
      }

    /* Set PBS_O_WORKDIR */
    {
    char *workdir_val;

    workdir_val = get_job_envvar(pjob,"PBS_O_WORKDIR");
    if (workdir_val != NULL)
      {
      put_env_var("PBS_O_WORKDIR", workdir_val);
      }
    }

    /* SET BEOWULF_JOB_MAP */

    {

    struct array_strings *vstrs;

    int VarIsSet = 0;
    int j;

    vstrs = pjob->ji_wattr[JOB_ATR_variables].at_val.at_arst;

    for (j = 0;j < vstrs->as_usedptr;++j)
      {
      if (!strncmp(
            vstrs->as_string[j],
            "BEOWULF_JOB_MAP=",
            strlen("BEOWULF_JOB_MAP=")))
        {
        VarIsSet = 1;

        break;
        }
      }

    if (VarIsSet == 1)
      {
      char *val = strchr(vstrs->as_string[j], '=');

      if (val != NULL)
        put_env_var("BEOWULF_JOB_MAP", val+1);
      }
    }

  /* Set some Moab env variables if they exist */

  if ((which == PE_PROLOG) || (which == PE_EPILOG))
    {
    char *tmp_val;

    for (aindex=0;aindex<moabenvcnt;aindex++)
      {
      tmp_val = get_job_envvar(pjob,moabenvs[aindex]);
      if (tmp_val != NULL)
        {
        put_env_var(moabenvs[aindex], tmp_val);
        }
      }
    }

  /*
   * if we want to run as user then we need to reset real user permissions
   * since it seems that some OSs use real not effective user id when execv'ing
   */

  if ((which == PE_PROLOGUSER) || 
      (which == PE_EPILOGUSER) || 
      (which == PE_PROLOGUSERJOB) || 
      (which == PE_EPILOGUSERJOB))
    {
    setuid_ext(pbsuser, TRUE);
    setegid(pbsgroup);

    if (setgid(pjob->ji_qs.ji_un.ji_momt.ji_exgid) != 0)
      {
      snprintf(log_buffer,sizeof(log_buffer),
        "setgid(%lu) for UID = %lu failed: %s\n",
        (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exgid,
        (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
        strerror(errno));
      
      log_err(errno, __func__, log_buffer);
     
      exit(-1);
      }
    
    if (setuid_ext(pjob->ji_qs.ji_un.ji_momt.ji_exuid, FALSE) != 0)
      {
      snprintf(log_buffer,sizeof(log_buffer),
        "setuid(%lu) failed: %s\n",
        (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
        strerror(errno));
      
      log_err(errno, __func__, log_buffer);
     
      exit(-1);
      }
    }

    execv(pelog,arg);

    sprintf(log_buffer,"execv of %s failed: %s\n",
      pelog,
      strerror(errno));

    if (write_ac_socket(2, log_buffer, strlen(log_buffer)) == -1)
      {
      /* cannot write message to stderr */

      /* NO-OP */
      }

    fsync(2);

    exit(255);
    }  /* END else () */

  switch (run_exit)
    {
    case 0:

      /* SUCCESS */

      /* NO-OP */

      break;

    case - 3:

      pelog_err(pjob, pelog, run_exit,  (char *)"child wait interrupted");

      break;

    case - 4:

      pelog_err(pjob, pelog, run_exit,  (char *)"prolog/epilog timeout occurred, child cleaned up");

      break;

    case - 5:

      pelog_err(pjob, pelog, run_exit, (char *) "prolog/epilog timeout occurred, cannot kill child");

      break;

    default:

      pelog_err(pjob, pelog, run_exit,  (char *)"nonzero p/e exit status");

      break;
    }  /* END switch (run_exit) */

  return(run_exit);
  }  /* END run_pelog() */
コード例 #9
0
ファイル: req_modify.c プロジェクト: agrawalravi90/pbspro
/**
 * @brief
 *	Service a Modify Job batch request.
 *
 * @par Processing order (as implemented below)
 *	1. Run the modifyjob hooks through process_hooks(); an explicit reject
 *	   is answered with PBSE_HOOKERROR.
 *	2. Locate the job with chk_job_request(); array sub-jobs/ranges, jobs
 *	   being provisioned (unless the requester is a scheduler) and jobs in
 *	   transit are rejected.
 *	3. Walk the attribute list once to validate it: unknown attributes,
 *	   attributes/resources that may not be altered while the job runs,
 *	   and attempts by a non-manager/operator to lower the run count are
 *	   refused.  The walk also records whether the job must be kept out of
 *	   the current scheduling cycle, whether MOM must be told, and whether
 *	   a select-only resource was given outside of "select".
 *	4. Apply the changes with modify_job_attr(), re-apply resource
 *	   defaults, re-evaluate the state of a non-running job (or save a
 *	   running one), and log the modification.
 *	5. Relay the request to MOM when a resource of a running job changed,
 *	   otherwise acknowledge it.
 *
 * @param[in,out] preq - the Modify Job request.  Every return path visible
 *			 here replies to it, except when chk_job_request()
 *			 returns NULL (presumably it has already replied --
 *			 confirm against its implementation).
 *
 * @return	void
 */
void
req_modifyjob(struct batch_request *preq)
{
	int		 add_to_am_list = 0; /* if altered during sched cycle */
	int		 bad = 0;
	int		 jt;		/* job type */
	int		 newstate;
	int		 newsubstate;
	resource_def	*outsideselect = NULL;
	job		*pjob;
	svrattrl	*plist;
	resource	*presc;
	resource_def	*prsd;
	int		 rc;
	int		 running = 0;
	int		 sendmom = 0;
	char		hook_msg[HOOK_MSG_SIZE];
	int		mod_project = 0;
	pbs_sched	*psched;

	switch (process_hooks(preq, hook_msg, sizeof(hook_msg),
			pbs_python_set_interrupt)) {
		case 0:	/* explicit reject */
			reply_text(preq, PBSE_HOOKERROR, hook_msg);
			return;
		case 1:   /* explicit accept */
			if (recreate_request(preq) == -1) { /* error */
				/* we have to reject the request, as 'preq' */
				/* may have been partly modified            */
				strcpy(hook_msg,
					"modifyjob event: rejected request");
				log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_HOOK,
					LOG_ERR, "", hook_msg);
				reply_text(preq, PBSE_HOOKERROR, hook_msg);
				return;
			}
			break;
		case 2:	/* no hook script executed - go ahead and accept event*/
			break;
		default:
			log_event(PBSEVENT_DEBUG2, PBS_EVENTCLASS_HOOK,
				LOG_INFO, "", "modifyjob event: accept req by default");
			break;
	}

	if (pseldef == NULL)  /* do one time to keep handy */
		pseldef = find_resc_def(svr_resc_def, "select", svr_resc_size);

	pjob = chk_job_request(preq->rq_ind.rq_modify.rq_objname, preq, &jt);
	if (pjob == NULL)
		return;

	if ((jt == IS_ARRAY_Single) || (jt == IS_ARRAY_Range)) {
		req_reject(PBSE_IVALREQ, 0, preq);
		return;
	}

	psched = find_sched_from_sock(preq->rq_conn);
	/* allow scheduler to modify job */
	if (psched == NULL) {
		/* provisioning job is not allowed to be modified */
		if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) &&
			(pjob->ji_qs.ji_substate == JOB_SUBSTATE_PROVISION)) {
			req_reject(PBSE_BADSTATE, 0, preq);
			return;
		}
	}

	/* cannot be in exiting or transit, exiting has already been checked */

	if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) {
		req_reject(PBSE_BADSTATE, 0, preq);
		return;
	}

	plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);
	if (plist == NULL) {	/* nothing to do */
		reply_ack(preq);
		return;
	}

	/*
	 * Special checks must be made:
	 *	if during a scheduling cycle and certain attributes are altered,
	 *	   make a note of the job to prevent it from being run now;
	 *	if job is running, only certain attributes/resources can be
	 *	   altered.
	 */

	if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) {
		running = 1;
	}
	while (plist) {
		int i;

		i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

		/*
		 * Unknown attribute: reject before "i" is used as a subscript.
		 * A negative index must never reach job_attr_def[] below.
		 */
		if (i < 0) {
			reply_badattr(PBSE_NOATTR, 1, plist, preq);
			return;
		}

		/*
		 * Is the attribute being altered one which could change
		 * scheduling (ATR_DFLAG_SCGALT set) and if a scheduling
		 * cycle is in progress, then set flag to add the job to list
		 * of jobs which cannot be run in this cycle.
		 * If the scheduler itself sends a modify job request,
		 * no need to delay the job until next cycle.
		 */
		if ((psched == NULL) && (scheduler_jobs_stat) &&
			(job_attr_def[i].at_flags & ATR_DFLAG_SCGALT))
			add_to_am_list = 1;

		/* Is the attribute modifiable in RUN state ? */

		if ((running == 1) &&
			((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0)) {

			reply_badattr(PBSE_MODATRRUN, 1, plist, preq);
			return;
		}
		if (i == (int)JOB_ATR_resource) {

			prsd = find_resc_def(svr_resc_def, plist->al_resc,
				svr_resc_size);

			if (prsd == 0) {
				reply_badattr(PBSE_UNKRESC, 1, plist, preq);
				return;
			}

			/* is the specified resource modifiable while */
			/* the job is running                         */

			if (running) {

				if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0) {
					reply_badattr(PBSE_MODATRRUN, 1, plist, preq);
					return;
				}

				/* a running job's resource changed: MOM must be told */
				sendmom = 1;
			}

			/* should the resource be only in a select spec */

			if (prsd->rs_flags & ATR_DFLAG_CVTSLT && !outsideselect &&
				plist->al_atopl.value && plist->al_atopl.value[0]) {
				/* if "-lresource" is set and has non-NULL value,
				** remember as potential bad resource
				** if this appears along "select".
				*/
				outsideselect = prsd;
			}
		}
		if (strcmp(plist->al_name, ATTR_project) == 0) {
			mod_project = 1;
		} else if ((strcmp(plist->al_name, ATTR_runcount) == 0) &&
			((plist->al_flags & ATR_VFLAG_HOOK) == 0) &&
			(plist->al_value != NULL) &&
			(plist->al_value[0] != '\0') &&
			((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0) &&
			(atol(plist->al_value) <
			    pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long)) {
			/* requester lacks manager/operator write permission and */
			/* is trying to lower the run count: not permitted       */
			sprintf(log_buffer,
				"regular user %s@%s cannot decrease '%s' attribute value from %ld to %ld",
				preq->rq_user, preq->rq_host, ATTR_runcount,
				pjob->ji_wattr[(int)JOB_ATR_runcount].at_val.at_long,
				atol(plist->al_value));
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_ERR,
				pjob->ji_qs.ji_jobid, log_buffer);
			req_reject(PBSE_PERM, 0, preq);
			return;
		}
		plist = (svrattrl *)GET_NEXT(plist->al_link);
	}

	if (outsideselect) {
		presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			pseldef);
		if (presc &&
			((presc->rs_value.at_flags & ATR_VFLAG_DEFLT) == 0)) {
			/* select is not a default, so reject qalter */

			resc_in_err = strdup(outsideselect->rs_name);
			req_reject(PBSE_INVALJOBRESC, 0, preq);
			return;
		}

	}

	/* modify the jobs attributes */

	bad = 0;
	plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);
	rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);
	if (rc) {
		if (pjob->ji_clterrmsg)
			reply_text(preq, rc, pjob->ji_clterrmsg);
		else
			reply_badattr(rc, bad, plist, preq);
		return;
	}

	/* If certain attributes modified and if in scheduling cycle  */
	/* then add to list of jobs which cannot be run in this cycle */

	if (add_to_am_list)
		am_jobs_add(pjob);	/* see req_runjob() */

	/* check if project attribute was requested to be modified to */
	/* be the default project value */
	if (mod_project && (pjob->ji_wattr[(int)JOB_ATR_project].at_flags &
							ATR_VFLAG_SET)) {

		if (strcmp(pjob->ji_wattr[(int)JOB_ATR_project].at_val.at_str,
			PBS_DEFAULT_PROJECT) == 0) {
			sprintf(log_buffer, msg_defproject,
				ATTR_project, PBS_DEFAULT_PROJECT);
#ifdef NAS /* localmod 107 */
			log_event(PBSEVENT_DEBUG4, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
#else
			log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
				pjob->ji_qs.ji_jobid, log_buffer);
#endif /* localmod 107 */
		}
	}

	if (pjob->ji_wattr[(int)JOB_ATR_resource].at_flags & ATR_VFLAG_MODIFY) {
		presc = find_resc_entry(&pjob->ji_wattr[(int)JOB_ATR_resource],
			pseldef);
		if (presc && (presc->rs_value.at_flags & ATR_VFLAG_DEFLT)) {
			/* changing Resource_List and select is a default    */
			/* clear "select" so it is rebuilt in set_resc_deflt */
			pseldef->rs_free(&presc->rs_value);
		}
	}

	/* Reset any defaults resource limit which might have been unset */
	if ((rc = set_resc_deflt((void *)pjob, JOB_OBJECT, NULL)) != 0) {
		req_reject(rc, 0, preq);
		return;
	}

	/* if job is not running, may need to change its state */

	if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING) {
		svr_evaljobstate(pjob, &newstate, &newsubstate, 0);
		(void)svr_setjobstate(pjob, newstate, newsubstate);
	} else {
		(void)job_save(pjob, SAVEJOB_FULL);
	}
	(void)sprintf(log_buffer, msg_manager, msg_jobmod,
		preq->rq_user, preq->rq_host);
	log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO,
		pjob->ji_qs.ji_jobid, log_buffer);

	/* if a resource limit changed for a running job, send to MOM */

	if (sendmom) {
		rc = relay_to_mom(pjob, preq, post_modify_req);
		if (rc)
			req_reject(rc, 0, preq);    /* unable to get to MOM */
		return;
	}

	reply_ack(preq);
}
コード例 #10
0
ファイル: cray_cpa.c プロジェクト: j0hnf/torque
/*
 * CPADestroyPartition()
 *
 * Destroys the CPA partition recorded in the job's resource list.
 *
 * The partition id and admin cookie are read from the job's
 * "cpapartition" and "cpaadmincookie" resources (stored as strings and
 * converted with atoL()) and handed to cpa_destroy_partition().
 *
 * pjob - I - job whose partition is to be destroyed
 *
 * Returns 0 on success, PBSE_SYSTEM when either resource definition or
 * resource entry cannot be found, and 1 when cpa_destroy_partition()
 * reports a non-zero return code.
 */
int CPADestroyPartition(

  job *pjob)

  {
  char id[] = "CPADestroyPartition";

  int rc;
  int ErrorP = 0;    /* O - non-zero if users of partition encountered error */
  unsigned long ParID;
  unsigned long long AdminCookie;

  resource              *presc;         /* Requested Resource List */
  resource_def          *prd;
  attribute             *pattr;

  pattr = &pjob->ji_wattr[JOB_ATR_resource];

  prd = find_resc_def(svr_resc_def, "cpapartition", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    return(PBSE_SYSTEM);  /* is this a real error? */
    }

  ParID = atoL(presc->rs_value.at_val.at_str);

  prd = find_resc_def(svr_resc_def, "cpaadmincookie", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    return(PBSE_SYSTEM);  /* is this a real error? */
    }

  AdminCookie = atoL(presc->rs_value.at_val.at_str);

  if (LOGLEVEL >= 2)
    {
    /* format the message into log_buffer so that log_record() records
     * this message rather than whatever the buffer held previously */

    snprintf(log_buffer, sizeof(log_buffer),
      "INFO:  destroying partition %lu with cookie %llu\n",
      ParID,
      AdminCookie);

    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
    }

  /* will fail if yod is present */

  rc = cpa_destroy_partition(
         ParID,
         AdminCookie,
         &ErrorP);      /* O - if set, destroy failed on one or more tasks */

  if (rc != 0)
    {
    sprintf(log_buffer, "cpa_destroy_partition: rc=%d (%s)\n",
            rc,
            cpa_rc2str(rc));

    log_err(-1, id, log_buffer);

    return(1);
    }

  return(0);
  }  /* END CPADestroyPartition() */
コード例 #11
0
ファイル: cray_cpa.c プロジェクト: j0hnf/torque
/*
 * CPACreatePartition()
 *
 * Creates a CPA partition for a job and assigns it to the job.
 *
 * Steps, in order:
 *   1. read the requested "size" resource and the execution uid from the
 *      job; fail if the size is not positive or the uid is negative
 *   2. if the job carries a "subnode_list" resource, turn it into a nid
 *      list (':' separators are rewritten to ',' first)
 *   3. build a node request, create the partition and assign it to the
 *      job id
 *   4. store the partition id, admin cookie and alloc cookie in the job's
 *      "cpapartition", "cpaadmincookie" and "cpaalloccookie" resources,
 *      and export BATCH_PARTITION_ID, BATCH_ALLOC_COOKIE and BATCH_JOBID
 *      through vtab (the admin cookie is deliberately not exported)
 *
 * pjob - I - job to create the partition for (its resource list is updated)
 * vtab - I - environment table passed to bld_env_variables()
 *
 * Returns 0 on success, 1 on a parameter or CPA library failure, and
 * PBSE_SYSTEM when a resource definition or entry cannot be obtained.
 *
 * NOTE(review): the failure returns after cpa_create_partition() has
 * succeeded do not destroy the new partition, and NodeReq is never
 * released here -- confirm whether the CPA library expects either.
 */
int CPACreatePartition(

  job              *pjob,   /* I */
  struct var_table *vtab)   /* I */

  {
  char id[] = "CPACreatePartition";

  cpa_node_req_t *NodeReq;

  int rc;

  char *Spec;

  int   PPN;
  int   Flags;
  int   Size = 0;
  int   UID;
  char *AcctID = NULL;
  char *JobID;
  char *HostList = NULL;  /* scheduler specified list of hosts to allocate (optional) */

  unsigned long      ParID;       /* O - partition id */
  unsigned long long AdminCookie; /* O - admin cookie */
  unsigned long long AllocCookie; /* O - alloc cookie */
  char longbuf[1024];

  resource            *presc;         /* Requested Resource List */
  resource_def        *prd;
  attribute           *pattr;

  cpa_nid_list_t       Wanted = NULL;

  /* first, get the size, uid, jobid, and subnodelist from the job */

  pattr = &pjob->ji_wattr[JOB_ATR_resource];
  prd = find_resc_def(svr_resc_def, "size", svr_resc_size);
  presc = find_resc_entry(pattr, prd);

  if (presc != NULL)
    {
    Size = presc->rs_value.at_val.at_long;
    }

  UID = pjob->ji_qs.ji_un.ji_momt.ji_exuid;

  if ((Size <= 0) || (UID < 0))
    {
    /* FAILURE */

    sprintf(log_buffer, "ERROR:  invalid parameters:  Size: %d  UID: %d  \n",
            Size,
            UID);

    log_err(-1, id, log_buffer);

    return(1);
    }

  prd = find_resc_def(svr_resc_def, "subnode_list", svr_resc_size);
  presc = find_resc_entry(pattr, prd);

  if (presc != NULL)
    {
    /* string values live in at_val.at_str, as for the other resources
     * read in this file */

    HostList = presc->rs_value.at_val.at_str;
    }

  if (pjob->ji_wattr[JOB_ATR_account].at_flags & ATR_VFLAG_SET)
    {
    AcctID = pjob->ji_wattr[JOB_ATR_account].at_val.at_str;
    }

  JobID = pjob->ji_qs.ji_jobid;

  PPN = 1;       /* NOTE: not really supported w/in CPA, always use 1 */
  Flags = 0;     /* NOTE: only allocate compute hosts, always use 0 */
  Spec = NULL;   /* NOTE: required node specification, not used */

  if (HostList != NULL)
    {
    char tmpBuffer[256000];
    int  index;

    rc = nid_list_create(
           0,
           MaxListSize,  /* max count */
           0,
           MaxNID,       /* max value */
           &Wanted);     /* O */

    if (rc != 0)
      {
      /* FAILURE */

      snprintf(log_buffer, sizeof(log_buffer), "nid_list_create: rc=%d (%s)\n",
               rc,
               cpa_rc2str(rc));

      log_err(-1, id, log_buffer);

      return(1);
      }

    /* bounded copy, always NUL-terminated; an over-long list is truncated */

    snprintf(tmpBuffer, sizeof(tmpBuffer), "%s", HostList);

    /* rewrite ':' separators to ',' before handing to nid_list_destringify() */

    for (index = 0;tmpBuffer[index] != '\0';index++)
      {
      if (tmpBuffer[index] == ':')
        tmpBuffer[index] = ',';
      }

    rc = nid_list_destringify(tmpBuffer, Wanted);

    if (rc != 0)
      {
      /* FAILURE */

      snprintf(log_buffer, sizeof(log_buffer), "nid_list_destringify: rc=%d (%s)\n",
               rc,
               cpa_rc2str(rc));

      log_err(-1, id, log_buffer);

      nid_list_destroy(Wanted);

      return(1);
      }

    if (LOGLEVEL >= 3)
      {
      char *buf = NULL;
      int   bufsize = 0;

      rc = nid_list_stringify(Wanted, &buf, &bufsize);

      if (rc == 0)
        {
        snprintf(log_buffer, sizeof(log_buffer), "CPANodeList: %s\n",
                 buf);
        }
      else
        {
        snprintf(log_buffer, sizeof(log_buffer), "CPA nid_list_stringify: rc=%d\n",
                 rc);
        }

      log_record(

        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);

      free(buf);
      }
    }

  NodeReq = cpa_new_node_req(

              Size, /* number of procs/nodes required by job */
              PPN,
              Flags,
              Spec,
              Wanted);  /* I */

  if (NodeReq == NULL)
    {
    /* FAILURE:  cannot alloc memory for node req */

    sprintf(log_buffer, "cpa_new_node_req: NULL\n");

    log_err(-1, id, log_buffer);

    nid_list_destroy(Wanted);

    return(1);
    }

  rc = cpa_create_partition(

         NodeReq,
         CPA_BATCH,
         CPA_NOT_SPECIFIED,
         UID,
         (AcctID != NULL) ? AcctID : "DEFAULT",
         (cpa_partition_id_t *) & ParID, /* O */
         (cpa_cookie_t *) & AdminCookie, /* O */
         (cpa_cookie_t *) & AllocCookie);  /* O */

  if (rc != 0)
    {
    /* FAILURE */

    sprintf(log_buffer, "cpa_create_partition: rc=%d (%s)\n",
            rc,
            cpa_rc2str(rc));

    log_err(-1, id, log_buffer);

    nid_list_destroy(Wanted);

    return(1);
    }

  rc = cpa_assign_partition(

         (cpa_partition_id_t)ParID,
         (cpa_cookie_t)AdminCookie,
         JobID,
         1);     /* NOT CURRENTLY USED - should be set to NID of 'master host' */

  /* free memory, nid list no longer required */

  nid_list_destroy(Wanted);

  if (rc != 0)
    {
    /* FAILURE */

    sprintf(log_buffer, "cpa_assign_partition: rc=%d (%s)\n",
            rc,
            cpa_rc2str(rc));

    log_err(-1, id, log_buffer);

    return(1);
    }

  /* save the partition and cookies in the job and vtab */

  prd = find_resc_def(svr_resc_def, "cpapartition", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    if ((presc = add_resource_entry(pattr, prd)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }
  else
    {
    /* entry already exists: release the old value before decoding the new one */

    prd->rs_free(&presc->rs_value);
    }

  snprintf(longbuf, sizeof(longbuf), "%lu", ParID);

  prd->rs_decode(&presc->rs_value, NULL, NULL, longbuf);
  presc->rs_value.at_flags |= ATR_VFLAG_SET;
  bld_env_variables(vtab, "BATCH_PARTITION_ID", longbuf);

  prd = find_resc_def(svr_resc_def, "cpaadmincookie", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    if ((presc = add_resource_entry(pattr, prd)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }
  else
    {
    prd->rs_free(&presc->rs_value);
    }

  snprintf(longbuf, sizeof(longbuf), "%llu", AdminCookie);

  prd->rs_decode(&presc->rs_value, NULL, NULL, longbuf);
  presc->rs_value.at_flags |= ATR_VFLAG_SET;
  /* admincookie doesn't go into job env */

  prd = find_resc_def(svr_resc_def, "cpaalloccookie", svr_resc_size);

  if (prd == NULL)
    {
    return(PBSE_SYSTEM);
    }

  if ((presc = find_resc_entry(pattr, prd)) == NULL)
    {
    if ((presc = add_resource_entry(pattr, prd)) == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }
  else
    {
    prd->rs_free(&presc->rs_value);
    }

  snprintf(longbuf, sizeof(longbuf), "%llu", AllocCookie);

  prd->rs_decode(&presc->rs_value, NULL, NULL, longbuf);
  presc->rs_value.at_flags |= ATR_VFLAG_SET;
  bld_env_variables(vtab, "BATCH_ALLOC_COOKIE", longbuf);

  bld_env_variables(vtab, "BATCH_JOBID", JobID);

  return(0);
  }  /* END CPACreatePartition() */
コード例 #12
0
ファイル: mom_job_func.c プロジェクト: ansonl/torque
/*
 * mom_job_purge()
 *
 * Removes a job from this MOM.
 *
 * The information needed to delete the job's files (job id, file prefix,
 * checkpoint directory, uid/gid, temp-dir flag) is copied into a freshly
 * allocated job_file_delete_info and handed to delete_job_files(), either
 * directly or through the thread pool.  The job's pids are dropped from
 * global_job_sid_set, the job is unlinked from the job lists and the
 * exiting list, the cpu frequency is restored when the job carried a
 * "cpuclock" resource, and the job structure is released with
 * mom_job_free().  When no jobs remain, MOMCheckRestart() is called.
 *
 * If the delete-info allocation fails the error is logged and the job is
 * left untouched.
 */
void mom_job_purge(

  job *pjob)  /* I (modified) */

  {
  job_file_delete_info *del_info =
    (job_file_delete_info *)calloc(1, sizeof(job_file_delete_info));

  if (!del_info)
    {
    log_err(ENOMEM,__func__, (char *)"No space to allocate info for job file deletion");
    return;
    }

#ifdef NVIDIA_GPUS
  /* a job that had gpus assigned triggers a gpu status update */
  if ((pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL) == false)
    {
    /* no gpu string recorded - nothing to report */
    }
  else if ((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0)
    {
    send_update_soon();
    }
#endif  /* NVIDIA_GPUS */

  /* fill in the deletion record */
  del_info->has_temp_dir = FALSE;

  if (pjob->ji_flags & MOM_HAS_TMPDIR)
    {
    /* the temp dir now belongs to the deletion record, not the job */
    del_info->has_temp_dir = TRUE;
    pjob->ji_flags &= ~MOM_HAS_TMPDIR;
    }

  strcpy(del_info->jobid, pjob->ji_qs.ji_jobid);
  strcpy(del_info->prefix, pjob->ji_qs.ji_fileprefix);

  /* the checkpoint dir is only recorded when both dir and name are set */
  if (pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_flags & ATR_VFLAG_SET)
    {
    if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
      {
      del_info->checkpoint_dir =
        strdup(pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_val.at_str);
      }
    }

  del_info->gid = pjob->ji_qs.ji_un.ji_momt.ji_exgid;
  del_info->uid = pjob->ji_qs.ji_un.ji_momt.ji_exuid;

  /* drop every pid of this job from the global session id set */
  job_pid_set_t::const_iterator pid_it = pjob->ji_job_pid_set->begin();

  while (pid_it != pjob->ji_job_pid_set->end())
    {
    pid_t one_pid = *pid_it;

    job_pid_set_t::const_iterator sid_it = global_job_sid_set.find(one_pid);

    if (sid_it != global_job_sid_set.end())
      {
      global_job_sid_set.erase(sid_it);
      }

    pid_it++;
    }

  /* delete the job's files, in the background when so configured */
  if (thread_unlink_calls != TRUE)
    {
    delete_job_files(del_info);
    }
  else
    {
    enqueue_threadpool_request(delete_job_files, del_info, request_pool);
    }

  /* take the job off the global lists */
  delete_link(&pjob->ji_jobque);
  delete_link(&pjob->ji_alljobs);

  remove_from_exiting_list(pjob);

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer,"removing job");

    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
    }

#if IBM_SP2==2        /* IBM SP PSSP 3.1 */
  unload_sp_switch(pjob);

#endif   /* IBM SP */

  /* a job that requested a cpu clock change gets the frequency put back
   * now that it is finished */
  resource_def *clock_def = find_resc_def(svr_resc_def, "cpuclock", svr_resc_size);
  resource     *clock_resc = find_resc_entry(&pjob->ji_wattr[JOB_ATR_resource], clock_def);

  if (clock_resc != NULL)
    {
    std::string freq_before;

    nd_frequency.get_frequency_string(freq_before);

    if (nd_frequency.restore_frequency())
      {
      std::string freq_after;

      nd_frequency.get_frequency_string(freq_after);

      std::string report = "Restored frequency from " + freq_before + " to " + freq_after;

      log_ext(PBSE_CHANGED_CPU_FREQUENCY,__func__, report.c_str(),LOG_NOTICE);
      }
    else
      {
      std::string report = "Failed to restore frequency.";

      log_ext(nd_frequency.get_last_error(),__func__,report.c_str(),LOG_ERR);
      }
    }

  mom_job_free(pjob);

  /* if no jobs are left, check if MOM should be restarted */

  if (((job *)GET_NEXT(svr_alljobs)) == NULL)
    {
    MOMCheckRestart();
    }

  return;
  }  /* END mom_job_purge() */
コード例 #13
0
ファイル: resc_attr.c プロジェクト: A9-William/pbspro
/**
 * @brief
 *	Decode (validate, then store) the value of a "place" resource.
 *
 * When not built for MOM, the value is checked as a ':'-separated list of
 * words.  Each word must match an entry of place_words[] (case-insensitive,
 * whole word); two words sharing a bit in pw_oneof may not appear together;
 * and a word whose pw_equalstr is set must be followed by "=name", where
 * "name" is a known resource of string or string-array type.  With Blue
 * Gene nodes present, a value containing "group=" is refused.  A value that
 * passes (or any value, in the MOM build) is handed to decode_str().
 *
 * @param[in,out] patr	- attribute to receive the decoded value
 * @param[in]	  name	- attribute name, passed through to decode_str()
 * @param[in]	  rescn	- resource name, passed through to decode_str()
 * @param[in]	  val	- the place specification; it is modified in place
 *			  only temporarily during the resource lookup and is
 *			  restored before this function returns
 *
 * @return	int
 * @retval	PBSE_BADATVAL	- NULL/empty value or malformed specification
 * @retval	PBSE_UNKRESC	- "=name" does not name a known resource
 * @retval	PBSE_RESCNOTSTR	- "=name" is not a string/string-array resource
 * @retval	PBSE_NGBLUEGENE	- "group=" used while Blue Gene nodes exist
 * @retval	otherwise	- whatever decode_str() returns
 */
int
decode_place(struct attribute *patr, char *name, char *rescn, char *val)
{
#ifndef PBS_MOM
	int   have_oneof = 0;
	size_t i;
	size_t ln;
	size_t wordlen;
	const size_t nwords = sizeof(place_words)/sizeof(place_words[0]);
	char  h;
	char *pc;
	char *px;
	struct resource_def *pres;

	extern int have_blue_gene_nodes;	/* BLUE GENE only */

	/* an empty value is rejected below; treat a missing one the same */
	if (val == NULL)
		return PBSE_BADATVAL;

	pc = val;

	while (1) {
		/* <ctype.h> arguments must be representable as unsigned char */
		while (isspace((unsigned char)*pc))
			++pc;
		if (*pc == '\0' || !isalpha((unsigned char)*pc))
			return PBSE_BADATVAL;
		/* found start of word,  look for end of word */
		px = pc+1;
		while (isalpha((unsigned char)*px))
			px++;

		/*
		 * Compare over the longer of the two lengths so that neither
		 * a prefix of a keyword nor a keyword followed by extra
		 * letters is accepted.
		 */
		for (i=0; i<nwords; ++i) {
			wordlen = strlen(place_words[i].pw_word);
			if (wordlen >= (size_t)(px-pc))
				ln = wordlen;
			else
				ln = (size_t)(px - pc);
			if (strncasecmp(pc, place_words[i].pw_word, ln) == 0) {
				break;
			}
		}
		if (i == nwords)
			return PBSE_BADATVAL;

		/* words sharing a pw_oneof bit are mutually exclusive */
		if (place_words[i].pw_oneof & have_oneof)
			return PBSE_BADATVAL;
		have_oneof |= place_words[i].pw_oneof;

		if (place_words[i].pw_equalstr) {
			if (*px != '=')
				return PBSE_BADATVAL;
			pc = ++px;
			while ((isalnum((unsigned char)*px) || (*px == '_') || (*px == '-')) &&
				(*px != ':'))
				++px;
			if (pc == px)
				return PBSE_BADATVAL;
			/* now need to see if the value is a valid resource/type */
			h = *px;
			*px = '\0';
			pres = find_resc_def(svr_resc_def, pc, svr_resc_size);
			/* put the caller's string back before any return */
			*px = h;
			if (pres == NULL)
				return PBSE_UNKRESC;
			if ((pres->rs_type != ATR_TYPE_STR) &&
				(pres->rs_type != ATR_TYPE_ARST))
				return PBSE_RESCNOTSTR;

			if (*px == '\0')
				break;
			else if (*px != ':')
				return PBSE_BADATVAL;
		}
		pc = px;
		if (*pc == '\0')
			break;
		else if (*pc != ':')
			return PBSE_BADATVAL;
		pc++;
	}

	/* BLUE GENE only  - cannot have bgl nodes and jobs with "group=" */

	if (have_blue_gene_nodes != 0) {
		if (strstr(val, "group=") != NULL)
			return PBSE_NGBLUEGENE;
	}
#endif	/* not PBS_MOM */

	return (decode_str(patr, name, rescn, val));
}
コード例 #14
0
ファイル: resc_attr.c プロジェクト: A9-William/pbspro
/**
 * @brief
 *	Action routine for the "nodes" resource: derive "nodect" and "ncpus"
 *	from the node specification string.
 *
 *	Nothing is done when actmode is ATR_ACTION_RECOV, when the nodes
 *	resource is not set, or when PBS_MOM is defined.  Otherwise the spec
 *	is validated, "nodect" is set to ctnodes() of the spec, and "ncpus" is
 *	either checked against ctcpus() of the spec (a new job that set ncpus
 *	explicitly) or forced to that count (every other case).
 *
 * @param[in]     pnodesp - the "nodes" resource entry
 * @param[in,out] pattr   - resource list holding nodect and ncpus; entries
 *                          are added when missing
 * @param[in]     pobj    - unused here
 * @param[in]     type    - unused here
 * @param[in]     actmode - ATR_ACTION_* code describing why we were called
 *
 * @return int
 * @retval 0              success, or nothing to do
 * @retval PBSE_SYSTEM    a resource definition or entry could not be obtained
 * @retval PBSE_BADATVAL  an explicit ncpus is inconsistent with the spec
 * @retval otherwise      the non-zero result of validate_nodespec()
 */
int
set_node_ct(resource *pnodesp, attribute *pattr, void *pobj, int type, int actmode)
{
#ifndef PBS_MOM
	int		 nn;		/* num of nodes */
	int		 nt;		/* num of tasks (processes) */
	int		 hcpp = 0;	/* filled in by ctcpus(); presumably set when the spec has :cpp */
	long		 nc;
	resource	*pnct;
	resource	*pncpus;
	resource_def	*pndef;

	if ((actmode == ATR_ACTION_RECOV) ||
		((pnodesp->rs_value.at_flags & ATR_VFLAG_SET) == 0))
		return (0);

	/* first validate the spec */

	if ((nn = validate_nodespec(pnodesp->rs_value.at_val.at_str)) != 0)
		return nn;

	/* Set "nodect" to count of nodes in "nodes" */

	pndef = find_resc_def(svr_resc_def, "nodect", svr_resc_size);
	if (pndef == (resource_def *)0)
		return (PBSE_SYSTEM);

	if ((pnct = find_resc_entry(pattr, pndef)) == (resource *)0) {
		if ((pnct = add_resource_entry(pattr, pndef)) == 0)
			return (PBSE_SYSTEM);
	}

	nn = ctnodes(pnodesp->rs_value.at_val.at_str);
	pnct->rs_value.at_val.at_long = nn;
	pnct->rs_value.at_flags |= ATR_VFLAG_SET|ATR_VFLAG_MODCACHE;

	/* find the number of cpus specified in the node string */

	nt = ctcpus(pnodesp->rs_value.at_val.at_str, &hcpp);

	/* Is "ncpus" set as a separate resource? */

	pndef = find_resc_def(svr_resc_def, "ncpus", svr_resc_size);
	if (pndef == (resource_def *)0)
		return (PBSE_SYSTEM);
	if ((pncpus = find_resc_entry(pattr, pndef)) == (resource *)0) {
		if ((pncpus = add_resource_entry(pattr, pndef)) == 0)
			return (PBSE_SYSTEM);
	}

	if (((pncpus->rs_value.at_flags & (ATR_VFLAG_SET | ATR_VFLAG_DEFLT)) ==
		ATR_VFLAG_SET) && (actmode == ATR_ACTION_NEW)) {
		/* ncpus is already set and not a default and new job */

		nc = pncpus->rs_value.at_val.at_long;
		if (hcpp && (nt != nc)) {
			/* if cpp string specified, this is an error */
			return (PBSE_BADATVAL);
		} else if ((nt == 0) || ((nc % nt) != 0)) {
			/* ncpus must be multiple of number of tasks; a task */
			/* count of zero is rejected here rather than being  */
			/* used as a divisor                                 */
			return (PBSE_BADATVAL);
		}

	} else {
		/* ncpus is not set or not a new job (qalter being done) */
		/* force ncpus to the correct thing */
		pncpus->rs_value.at_val.at_long = nt;
		pncpus->rs_value.at_flags |= (ATR_VFLAG_SET|ATR_VFLAG_MODCACHE);
	}


#endif	/* not MOM */
	return (0);
}
コード例 #15
0
ファイル: job_route.c プロジェクト: spuder/torque
/* int initialize_procct - set pjob->procct plus the resource
 * procct in the Resource_List
 *
 * Assumes the nodes resource has been set on the Resource_List. This should
 * have been done in req_quejob with the set_nodes_attr() function or in
 * set_node_ct and/or set_proc_ct.
 *
 * Returns 0 on success. Non-zero on failure
 */
int initialize_procct(job *pjob)
{
    char id[] = "initialize_procct";
    resource     *pnodesp = NULL;
    resource_def *pnodes_def = NULL;
    resource     *pprocsp = NULL;
    resource_def *pprocs_def = NULL;
    resource     *procctp = NULL;
    resource_def *procct_def = NULL;
    pbs_attribute    *pattr = NULL;

    pattr = &pjob->ji_wattr[JOB_ATR_resource];
    if(pattr == NULL)
    {
        /* Something is really wrong. ji_wattr[JOB_ATR_resource] should always be set
           by the time this function is called */
        sprintf(log_buffer, "%s: Resource_List is NULL. Cannot proceed", id);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
        pbs_errno = PBSE_INTERNAL;
        return(ROUTE_PERM_FAILURE);
    }

    /* Has nodes been initialzed */
    if(pattr->at_flags & ATR_VFLAG_SET)
    {
        /* get the node spec from the nodes resource */
        pnodes_def = find_resc_def(svr_resc_def, "nodes", svr_resc_size);
        if(pnodes_def == NULL)
        {
            sprintf(log_buffer, "%s: Could not get nodes resource definition. Cannot proceed", id);
            log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
            pbs_errno = PBSE_INTERNAL;
            return(ROUTE_PERM_FAILURE);
        }
        pnodesp = find_resc_entry(pattr, pnodes_def);

        /* Get the procs count if the procs resource attribute is set */
        pprocs_def = find_resc_def(svr_resc_def, "procs", svr_resc_size);
        if(pprocs_def != NULL)
        {
            /* if pprocs_def is NULL we just go on. Otherwise we will get its value now */
            pprocsp = find_resc_entry(pattr, pprocs_def);
            /* We will evaluate pprocsp later. If it is null we do not care */
        }

        /* if neither pnodesp nor pprocsp are set, terminate */
        if(pnodesp == NULL && pprocsp == NULL)
        {
            /* nodes and procs were not set. Hopefully req_quejob set procct to 1 for us already */
            procct_def = find_resc_def(svr_resc_def, "procct", svr_resc_size);
            if(procct_def == NULL)
            {
                sprintf(log_buffer, "%s: Could not get procct resource definition. Cannot proceed", id);
                log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
                pbs_errno = PBSE_INTERNAL;
                return(ROUTE_PERM_FAILURE);
            }
            procctp = find_resc_entry(pattr, procct_def);
            if(procctp == NULL)
            {
                sprintf(log_buffer, "%s: Could not get nodes nor procs entry from Resource_List. Cannot proceed", id);
                log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
                pbs_errno = PBSE_INTERNAL;
                return(ROUTE_PERM_FAILURE);
            }
        }

        /* we now set pjob->procct and we also set the resource attribute procct */
        procct_def = find_resc_def(svr_resc_def, "procct", svr_resc_size);
        if(procct_def == NULL)
        {
            sprintf(log_buffer, "%s: Could not get procct resource definition. Cannot proceed", id);
            log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
            pbs_errno = PBSE_INTERNAL;
            return(ROUTE_PERM_FAILURE);
        }
        procctp = find_resc_entry(pattr, procct_def);
        if(procctp == NULL)
        {
            procctp = add_resource_entry(pattr, procct_def);
            if(procctp == NULL)
            {
                sprintf(log_buffer, "%s: Could not add procct resource. Cannot proceed", id);
                log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
                pbs_errno = PBSE_INTERNAL;
                return(ROUTE_PERM_FAILURE);
            }
        }

        /* Finally the moment of truth. We have the nodes and procs resources. Add them
           to the procct resoruce*/
        procctp->rs_value.at_val.at_long = 0;
        if(pnodesp != NULL)
        {
            procctp->rs_value.at_val.at_long = count_proc(pnodesp->rs_value.at_val.at_str);
        }

        if(pprocsp != NULL)
        {
            procctp->rs_value.at_val.at_long += pprocsp->rs_value.at_val.at_long;
        }
        procctp->rs_value.at_flags |= ATR_VFLAG_SET;
    }
    else
    {
        /* Something is really wrong. ji_wattr[JOB_ATR_resource] should always be set
           by the time this function is called */
        sprintf(log_buffer, "%s: Resource_List not set. Cannot proceed", id);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
        pbs_errno = PBSE_INTERNAL;
        return(ROUTE_PERM_FAILURE);
    }

    return(PBSE_NONE);
} /* END initialize_procct */
コード例 #16
0
/*
 * req_quejob - handle a Queue Job request.
 *
 * The request is accepted only when it comes from a server (rq_fromsvr);
 * the comments in the body indicate this is the MOM side of the exchange.
 * The job structure is found or allocated, the attributes carried by the
 * request are decoded into it, the job id is sent back as the reply, and the
 * job is linked onto svr_newjobs.
 *
 * If the job already exists and has a checkpoint file
 * (JOB_SVFLG_CHECKPOINT_FILE), the existing job is kept and only the
 * checkpoint-name and variable-list attributes are taken from the request.
 *
 * Every failure is answered on the request itself (req_reject or
 * reply_badattr); nothing is returned to the caller.
 */
void req_quejob(

  struct batch_request *preq) /* ptr to the decoded request   */

  {
  char  *id = "req_quejob";

  /* zero-filled so it is a valid (empty) string even when the request
   * carries no hashname attribute */
  char   basename[PBS_JOBBASE + 1] = "";
  int    created_here = 0;
  int    index;
  char  *jid;
  attribute_def *pdef;
  job   *pj;
  svrattrl *psatl;
  int    rc;
  int    sock = preq->rq_conn;

  int    IsCheckpoint = 0;

  /* set basic (user) level access permission */

  resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat;

  if (PBSNodeCheckProlog)
    {
    check_state(1);

    mom_server_all_update_stat();

    if (internal_state & INUSE_DOWN)
      {
      req_reject(PBSE_MOMREJECT,0,preq,NULL,NULL);

      return;
      }
    }

  if (preq->rq_fromsvr)
    {
    /* from another server - accept the extra attributes */

    resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM;

    jid = preq->rq_ind.rq_queuejob.rq_jid;
    }
  else
    {
    /* request must be from server */

    log_err(errno, id, "request not from server");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server");

    return;
    }

  /* does job already exist, check both old and new jobs */

  if ((pj = find_job(jid)) == NULL)
    {
    pj = (job *)GET_NEXT(svr_newjobs);

    while (pj != NULL)
      {
      if (!strcmp(pj->ji_qs.ji_jobid, jid))
        break;

      pj = (job *)GET_NEXT(pj->ji_alljobs);
      }
    }

  /*
   * New job ...
   *
   * for MOM - rather than make up a hashname, we use the name sent
   * to us by the server as an attribute.
   */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (!strcmp(psatl->al_name,ATTR_hashname))
      {
      /* the value arrives in the request, so bound the copy to the
       * buffer; anything longer is truncated */

      strncpy(basename,psatl->al_value,sizeof(basename) - 1);

      basename[sizeof(basename) - 1] = '\0';

      break;
      }

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }

  if (pj != NULL)
    {
    /* newly queued job already exists */

    if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING)
      {
      /* FAILURE - job exists and is running */

      log_err(errno,id,"cannot queue new job, job exists and is running");

      req_reject(PBSE_JOBEXIST,0,preq,NULL,"job is running");

      return;
      }

    /* if checkpointed, then keep old and skip rest of process */

    if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
      {
      IsCheckpoint = 1;
      }  /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */
    else
      {
      /* unlink job from svr_alljobs since it will be placed on newjobs */

      delete_link(&pj->ji_alljobs);
      }
    }  /* END if (pj != NULL) */
  else
    {
    /* if not already here, allocate job struct */

    if ((pj = job_alloc()) == NULL)
      {
      /* FAILURE */

      req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot allocate new job structure");

      return;
      }
    }    /* END else (pj != NULL) */

  if (IsCheckpoint == 0)
    {
    strcpy(pj->ji_qs.ji_jobid,jid);

    /* basename holds at most PBS_JOBBASE characters plus the terminator;
     * presumably ji_fileprefix is at least that large - TODO confirm */

    strcpy(pj->ji_qs.ji_fileprefix,basename);

    pj->ji_modified       = 1;

    pj->ji_qs.ji_svrflags = created_here;

    pj->ji_qs.ji_un_type  = JOB_UNION_TYPE_NEW;
    }

  /* decode attributes from request into job structure */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (IsCheckpoint == 1)
      {
      /* for a kept checkpointed job only the checkpoint name and the
       * variable list are taken from the request */

      if (strcmp(psatl->al_name,ATTR_checkpoint_name) &&
          strcmp(psatl->al_name,ATTR_v))
        {
        psatl = (svrattrl *)GET_NEXT(psatl->al_link);

        continue;
        }
      }

    /* identify the attribute by name */

    index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST);

    if (index < 0)
      {
      /* FAILURE */

      /* didn`t recognize the name */

      job_purge(pj);   /* CRI - 12/20/2004 */

      reply_badattr(PBSE_NOATTR,1,psatl,preq);

      return;
      }

    pdef = &job_attr_def[index];

    /* Is attribute not writeable by manager or by a server? */

    if ((pdef->at_flags & resc_access_perm) == 0)
      {
      /* FAILURE */

      job_purge(pj);

      reply_badattr(PBSE_ATTRRO,1,psatl,preq);

      return;
      }

    /* decode attribute */

    if (!strcmp(psatl->al_name,ATTR_v))
      {
      rc = decode_arst_merge(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value);
      }
    else
      {
      rc = pdef->at_decode(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value);
      }

    if (rc != 0)
      {
      /* FAILURE */

      /* all errors are fatal for MOM */

      job_purge(pj);

      reply_badattr(rc,1,psatl,preq);

      return;
      }

    if (psatl->al_op == DFLT)
      {
      if (psatl->al_resc)
        {
        resource     *presc;
        resource_def *prdef;

        prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size);

        if (prdef == NULL)
          {
          /* FAILURE - rc is 0 at this point, so name the error
           * explicitly instead of replying with "no error" */

          job_purge(pj);

          reply_badattr(PBSE_UNKRESC,1,psatl, preq);

          return;
          }

        presc = find_resc_entry(&pj->ji_wattr[index],prdef);

        if (presc != NULL)
          presc->rs_value.at_flags |= ATR_VFLAG_DEFLT;
        }
      else
        {
        pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT;
        }
      }    /* END if (psatl->al_op == DFLT) */

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }      /* END while (psatl != NULL) */

  if (IsCheckpoint == 1)
    {
    pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

    if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0)
      {
      /* move the kept job onto the new-jobs list */

      delete_link(&pj->ji_alljobs);

      append_link(&svr_newjobs,&pj->ji_alljobs,pj);

      pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;
      pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;
      pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock);
      pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

      /* Per Eric R., req_mvjobfile was giving error in open_std_file, 
         showed up as fishy error message */

      if (pj->ji_grpcache != NULL)
        {
        free(pj->ji_grpcache);
        pj->ji_grpcache = NULL;
        }
      }
    else
      {
      close_conn(sock);
      }

    /* SUCCESS */

    return;
    }

  /* set remaining job structure elements */

  pj->ji_qs.ji_state =    JOB_STATE_TRANSIT;

  pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

  pj->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long = (long)time_now;

  pj->ji_wattr[(int)JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET;

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;

  pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;

  pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock);

  pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

  /* acknowledge the request with the job id */

  if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0)
    {
    /* reply failed, purge the job and close the connection */

    close_conn(sock);

    job_purge(pj);

    return;
    }

  /* link job into server's new jobs list request  */

  append_link(&svr_newjobs, &pj->ji_alljobs, pj);

  return;
  }  /* END req_quejob() */
コード例 #17
0
/*
 * chk_job_torun - check that a job may be started and reserve its hosts.
 *
 * Used for run and stage-in requests (preq->rq_type is compared against
 * PBS_BATCH_StageIn).  The job named in the request must exist, must not
 * already be started, must sit in an execution queue, and the requester
 * needs manager or operator privilege.  Hosts are then reserved with
 * assign_hosts(): on the previously recorded exec_host when the job has a
 * checkpoint file or staged-in files, otherwise on the destination given in
 * the request (rq_destin, or rq_extend when rq_destin is empty).
 *
 * Returns the job on success.  On failure the request has already been
 * rejected and NULL is returned.
 */
static job *chk_job_torun(

  struct batch_request *preq,  /* I */
  int                   setnn) /* I - 1 to set neednodes (only with TDEV) */

  {
  static char *id = "chk_job_torun";

  job              *pjob;

  struct rq_runjob *prun;
  int               rc;

  char              EMsg[1024];
  char              FailHost[1024];
  char              exec_host[1024];
  char              *ptr;

  /* both buffers are handed to req_reject() when assign_hosts() fails;
   * start them empty in case assign_hosts() does not fill them in */

  EMsg[0] = '\0';
  FailHost[0] = '\0';

  prun = &preq->rq_ind.rq_run;

  if ((pjob = chk_job_request(prun->rq_jid, preq)) == 0)
    {
    /* FAILURE */

    return(NULL);
    }

  if ((pjob->ji_qs.ji_state == JOB_STATE_TRANSIT) ||
      (pjob->ji_qs.ji_state == JOB_STATE_EXITING) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEGO) ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN)  ||
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* FAILURE - job already started */

    req_reject(PBSE_BADSTATE, 0, preq, NULL, "job already running");

    return(NULL);
    }

  if (preq->rq_type == PBS_BATCH_StageIn)
    {
    if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_STAGEIN)
      {
      /* FAILURE */

      req_reject(PBSE_BADSTATE, 0, preq, NULL, NULL);

      return(NULL);
      }
    }

  if ((preq->rq_perm & (ATR_DFLAG_MGWR | ATR_DFLAG_OPWR)) == 0)
    {
    /* FAILURE - run request not authorized */

    req_reject(PBSE_PERM, 0, preq, NULL, NULL);

    return(NULL);
    }

  if (pjob->ji_qhdr->qu_qs.qu_type != QTYPE_Execution)
    {
    /* FAILURE - job must be in execution queue */

    log_err(-1, id, "attempt to start job in non-execution queue");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "job not in execution queue");

    return(NULL);
    }

  /* where to execute the job */

#ifdef ENABLE_BLCR
  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_StagedIn)
#else
  if (pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_StagedIn))
#endif
    {
    /* job has been checkpointed or files already staged in */
    /* in this case, exec_host must be already set          */

    if (prun->rq_destin && *prun->rq_destin) /* If a destination has been specified */
      {
      /* specified destination must match exec_host */

      char *cur_host = pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str;

      /* Only the text before the first '/' is compared, so a bounded copy
       * is enough; a long host list must not overrun the local buffer.
       * An unset exec_host is treated as empty and so never matches. */

      if (cur_host == NULL)
        cur_host = "";

      strncpy(exec_host, cur_host, sizeof(exec_host) - 1);

      exec_host[sizeof(exec_host) - 1] = '\0';

      if ((ptr = strchr(exec_host, '/')))
        * ptr = 0; /* For some reason, node name has "/0" on the end (i.e. "node0001/0"). */

      if (strcmp(prun->rq_destin, exec_host) != 0)
        {
        /* FAILURE */

        if (pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE))
          req_reject(PBSE_EXECTHERE, 0, preq, NULL, "allocated nodes must match checkpoint location");
        else
          req_reject(PBSE_EXECTHERE, 0, preq, NULL, "allocated nodes must match input file stagein location");

        return(NULL);
        }
      }

    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HasNodes) == 0)
      {
      /* re-reserve nodes and leave exec_host as is */

      if ((rc = assign_hosts(  /* inside chk_job_torun() */
                  pjob,
                  pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str,
                  0,
                  FailHost,
                  EMsg)) != 0)   /* O */
        {
        req_reject(PBSE_EXECTHERE, 0, preq, FailHost, EMsg);

        return(NULL);
        }
      }
    }    /* END if (pjob->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE|JOB_SVFLG_StagedIn)) */
  else
    {
    /* make sure exec gpus is clear */
    if (((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
      {
      job_attr_def[(int)JOB_ATR_exec_gpus].at_free(
        &pjob->ji_wattr[JOB_ATR_exec_gpus]);
      }

    /* job has not run before or need not run there again */
    /* reserve nodes and set new exec_host */
    if ((prun->rq_destin == NULL) || (prun->rq_destin[0] == '\0'))
      {
      /* it is possible for the scheduler to pass a hostlist using the 
       * rq_extend field--we should use it as the given list
       * as an alternative to rq_destin */

      rc = assign_hosts(pjob, preq->rq_extend, 1, FailHost, EMsg);  /* inside chk_job_torun() */
      }
    else
      {
      rc = assign_hosts(pjob, prun->rq_destin, 1, FailHost, EMsg);  /* inside chk_job_torun() */
      }

    if (rc != 0)
      {
      /* FAILURE - cannot assign correct hosts */

      req_reject(rc, 0, preq, FailHost, EMsg);

      return(NULL);
      }
    }

  if (setnn == 1)
    {
#ifdef TDEV
    /* what should neednodes be set to? */

    resource_def *DRes;  /* resource definition */

    resource *JRes;      /* resource on job */

    attribute *Attr;     /* 'neednodes' attribute */

    Attr = &pjob->ji_wattr[(int)JOB_ATR_resource];

    DRes = find_resc_def(svr_resc_def, "neednodes", svr_resc_size);

    JRes = find_resc_entry(Attr, DRes);

    if ((JRes == NULL) ||
        ((JRes->rs_value.at_flags & ATR_VFLAG_SET) == 0))
      {
      /* resource does not exist or value is not set */

      if (JRes == NULL)
        {
        JRes = add_resource_entry(Attr, DRes);
        }

      if (JRes != NULL)
        {
        if (DRes->rs_defin->rs_set(
              &JRes->rs_value,
              &DRes->rs_value,
              SET) == 0)
          {
          JRes->rs_value.at_flags |= ATR_VFLAG_SET;
          }
        }
      }

#endif /* TDEV */
    }    /* END if (setnn == 1) */

  return(pjob);
  }  /* END chk_job_torun() */
コード例 #18
0
ファイル: req_modify.c プロジェクト: dhill12/test
/*
 * modify_job - apply an attribute modification request to one job.
 *
 * For a running job every attribute in plist must carry ATR_DFLAG_ALTRUN
 * (and, for Resource_List entries, so must the named resource) before
 * anything is changed.  The attributes of the request are then applied with
 * modify_job_attr(), default resource limits are reset, and the job state
 * is re-evaluated (not running) or the job is saved (running).  When a
 * resource of a running job changed, the request is relayed to the MOM
 * unless flag is NO_MOM_RELAY.  For CHK_HOLD / CHK_CONT on a running job
 * the checkpoint files are requested from the MOM.
 *
 * Returns
 *   PBSE_NONE            - modified, nothing relayed to the MOM for resources
 *   PBSE_RELAYED_TO_MOM  - modified and relayed (or the caller relays)
 *   PBSE_IVALREQ         - *j is NULL
 *   PBSE_BADSTATE        - job is in transit
 *   PBSE_MODATRRUN       - attribute/resource not alterable while running
 *   PBSE_UNKRESC         - unknown resource named for a running job
 *   anything else        - error from modify_job_attr(),
 *                          copy_batchrequest() or relay_to_mom()
 *
 * *j is set to NULL when modify_job_attr() reports PBSE_JOBNOTFOUND.
 */
int modify_job(

  void                 **j,               /* O */
  svrattrl              *plist,           /* I */
  struct batch_request  *preq,            /* I */
  int                    checkpoint_req,  /* I */
  int                    flag)            /* I */

  {
  int   bad = 0;
  int   i;
  int   newstate;
  int   newsubstate;
  resource_def *prsd;
  int   rc;
  int   sendmom = 0;
  int   copy_checkpoint_files = FALSE;

  char  log_buf[LOCAL_LOG_BUF_SIZE];
  struct batch_request *dup_req = NULL;

  job *pjob = (job *)*j;
  
  if (pjob == NULL)
    {
    snprintf(log_buf, sizeof(log_buf), "job structure is NULL");
    log_err(PBSE_IVALREQ, __func__, log_buf);
    return(PBSE_IVALREQ);
    }

  /* cannot be in exiting or transit, exiting has already been checked */

  if (pjob->ji_qs.ji_state == JOB_STATE_TRANSIT)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot modify job '%s' in transit\n",
      pjob->ji_qs.ji_jobid);

    log_err(PBSE_BADSTATE, __func__, log_buf);

    return(PBSE_BADSTATE);
    }

  if (((checkpoint_req == CHK_HOLD) || (checkpoint_req == CHK_CONT)) &&
      (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING))
    {
    /* May need to request copy of the checkpoint file from mom */

    copy_checkpoint_files = TRUE;

    if (checkpoint_req == CHK_HOLD)
      {

      snprintf(log_buf, sizeof(log_buf),
        "setting jobsubstate for %s to RERUN\n", pjob->ji_qs.ji_jobid);

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RERUN;

      job_save(pjob, SAVEJOB_QUICK, 0);

      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);

      /* remove checkpoint restart file if there is one */
      
      if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET)
        {
        cleanup_restart_file(pjob);
        }

      }
    }

  /* if job is running, special checks must be made */

  /* NOTE:  must determine if job exists down at MOM - this will occur if
            job is running, job is held, or job was held and just barely
            released (ie qhold/qrls) */

  /* A wider test that also covered held and queued jobs with ji_destin set
   * was disabled in 2.3 because of customer problems: ji_destin gets set on
   * a qmove, and the mom does not have the job at that point.  Only the
   * running state is checked. */

  if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING)
    {
    while (plist != NULL)
      {
      /* is the pbs_attribute modifiable in RUN state ? */

      i = find_attr(job_attr_def, plist->al_name, JOB_ATR_LAST);

      if ((i < 0) ||
          ((job_attr_def[i].at_flags & ATR_DFLAG_ALTRUN) == 0))
        {
        /* FAILURE */
        snprintf(log_buf,sizeof(log_buf),
          "Cannot modify attribute '%s' while running\n",
          plist->al_name);
        log_err(PBSE_MODATRRUN, __func__, log_buf);

        return PBSE_MODATRRUN;
        }

      /* NOTE:  only explicitly specified job attributes are routed down to MOM */

      if (i == JOB_ATR_resource)
        {
        /* is the specified resource modifiable while */
        /* the job is running                         */

        prsd = find_resc_def(svr_resc_def, plist->al_resc, svr_resc_size);

        if (prsd == NULL)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Unknown attribute '%s'\n",
            plist->al_name);

          log_err(PBSE_UNKRESC, __func__, log_buf);

          return(PBSE_UNKRESC);
          }

        if ((prsd->rs_flags & ATR_DFLAG_ALTRUN) == 0)
          {
          /* FAILURE */
          snprintf(log_buf,sizeof(log_buf),
            "Cannot modify attribute '%s' while running\n",
            plist->al_name);
          log_err(PBSE_MODATRRUN, __func__, log_buf);

          return(PBSE_MODATRRUN);
          }

        sendmom = 1;
        }

      plist = (svrattrl *)GET_NEXT(plist->al_link);
      }
    }    /* END if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) */

  /* modify the job's attributes */

  bad = 0;

  /* the loop above may have walked plist to its end: start over from
   * the list carried by the request */

  plist = (svrattrl *)GET_NEXT(preq->rq_ind.rq_modify.rq_attr);

  rc = modify_job_attr(pjob, plist, preq->rq_perm, &bad);

  if (rc)
    {
    /* FAILURE */
    snprintf(log_buf,sizeof(log_buf),
      "Cannot set attributes for job '%s'\n",
      pjob->ji_qs.ji_jobid);
    log_err(rc, __func__, log_buf);

    if (rc == PBSE_JOBNOTFOUND)
      *j = NULL;

    return(rc);
    }

  /* Reset any defaults resource limit which might have been unset */

  set_resc_deflt(pjob, NULL, FALSE);

  /* if job is not running, may need to change its state */

  if (pjob->ji_qs.ji_state != JOB_STATE_RUNNING)
    {
    svr_evaljobstate(pjob, &newstate, &newsubstate, 0);

    svr_setjobstate(pjob, newstate, newsubstate, FALSE);
    }
  else
    {
    job_save(pjob, SAVEJOB_FULL, 0);
    }

  /* rq_user and rq_host come from the request: keep the formatted
   * message inside log_buf */

  snprintf(log_buf, sizeof(log_buf), msg_manager, msg_jobmod, preq->rq_user, preq->rq_host);

  log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

  /* if a resource limit changed for a running job, send to MOM */

  if (sendmom)
    {
    /* if the NO_MOM_RELAY flag is set the calling function will call
       relay_to_mom so we do not need to do it here */
    if (flag != NO_MOM_RELAY)
      {
      /* The last number is unused unless this is an array */
      if ((rc = copy_batchrequest(&dup_req, preq, 0, -1)) != 0)
        {
        /* nothing could be relayed, so do not claim that it was:
         * report the failure to the caller instead */
        snprintf(log_buf,sizeof(log_buf),
          "Unable to copy batch request for job '%s'\n",
          pjob->ji_qs.ji_jobid);

        log_err(rc, __func__, log_buf);

        return(rc);
        }

      /* The dup_req is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      if ((rc = relay_to_mom(&pjob, dup_req, post_modify_req)))
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }

        return(rc); /* unable to get to MOM */
        }
      }

    return(PBSE_RELAYED_TO_MOM);
    }

  if (copy_checkpoint_files)
    {
    struct batch_request *momreq = 0;
    momreq = cpy_checkpoint(momreq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_OUT);

    if (momreq != NULL)
      {
      /* have files to copy */
      momreq->rq_extra = strdup(pjob->ji_qs.ji_jobid);

      /* The momreq is freed in relay_to_mom (failure)
       * or in issue_Drequest (success) */
      if (checkpoint_req == CHK_HOLD)
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_hold);
        }
      else
        {
        rc = relay_to_mom(&pjob, momreq, chkpt_xfr_done);
        }

      if (rc != 0)
        {
        if (pjob != NULL)
          {
          snprintf(log_buf,sizeof(log_buf),
            "Unable to relay information to mom for job '%s'\n",
            pjob->ji_qs.ji_jobid);
          
          log_err(rc, __func__, log_buf);
          }

        return(PBSE_NONE);  /* come back when mom replies */
        }
      }
    else
      {
      log_err(-1, __func__, "Failed to get batch request");
      }
    }

  return(PBSE_NONE);
  } /* END modify_job() */
コード例 #19
0
/*
 * get_correct_spec_string - put the gpu mode back into a node spec.
 *
 * When given contains a ":gpus=" request, the job's "neednodes" resource is
 * searched for the gpu mode that followed its own ":gpus=<n>" (one of
 * exclusive_thread, exclusive_process, exclusive, default, shared).  That
 * mode is appended to every '+'-separated request of given that contains
 * ":gpus".
 *
 * In every other case (no gpu request in given, no neednodes entry, or no
 * mode recorded in neednodes) an unchanged copy of given is produced.
 *
 * given is split in place while it is being copied, but is intact again on
 * return.
 *
 * Returns a string obtained from calloc()/strdup(), or NULL when that
 * allocation fails.
 */
char *get_correct_spec_string(

  char *given,
  job  *pjob)

  {
  static char   id[] = "get_correct_spec_string";
  char     mode[20];
  char    *mode_string;
  char    *request = NULL;
  char    *correct_spec;
  char    *outer_plus;
  char    *plus;
  char    *one_req;
  int      num_gpu_reqs;
  size_t   num_reqs;
  char    *gpu_req = NULL;
  size_t   len;
  resource *pres;

  /* check to see if there is a gpus request. If so moab
   * stripped the mode request if it existed. We need to
   * put it back */
  mode_string = strstr(given, ":gpus=");

  if (mode_string == NULL)
    return(strdup(given));

  /* determine # of gpu requests in spec, we found 1 in given up above */
  num_gpu_reqs = 1;
  gpu_req = mode_string;
  while ((gpu_req = strstr(gpu_req + 1, ":gpus=")) != NULL)
    num_gpu_reqs++;

  /* the gpu mode is taken from what is in the job's "neednodes" */
  pres = find_resc_entry(
    &pjob->ji_wattr[(int)JOB_ATR_resource],
    find_resc_def(svr_resc_def, "neednodes", svr_resc_size));

  if (pres != NULL)
    request = pres->rs_value.at_val.at_str;

  if ((request != NULL) &&
      (request[0] != 0))
    gpu_req = strstr(request, ":gpus=");

  if (gpu_req == NULL)
    {
    /* neednodes is missing, empty, or has no gpu request: there is no
     * mode to recover, so the spec is used exactly as given */
    return(strdup(given));
    }

  /* step over ":gpus=<digits>"; a mode, if any, follows a ':' */
  mode_string = gpu_req + strlen(":gpus=");
  while (isdigit((unsigned char)*mode_string))
    mode_string++;

  if (*mode_string != ':')
    return(strdup(given));

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buffer, "%s: job has %d gpu requests in node spec '%s'",
            id,
            num_gpu_reqs,
            given);

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
    }

  /* look only at the first '+'-separated request of neednodes;
   * the '+' is put back once the mode has been identified */
  if ((outer_plus = strchr(mode_string, '+')) != NULL)
    *outer_plus = '\0';

  /* 
   * The neednodes original value may have non gpu things in it, so we
   * can not rely on the requested gpu mode being the first item in the
   * the string after the gpus=x:.
   */

  if (strstr(mode_string, "exclusive_thread"))
    {
    strcpy(mode, ":exclusive_thread");
    }
  else if (strstr(mode_string, "exclusive_process"))
    {
    strcpy(mode, ":exclusive_process");
    }
  else if (strstr(mode_string, "exclusive"))
    {
    strcpy(mode, ":exclusive");
    }
  else if (strstr(mode_string, "default"))
    {
    strcpy(mode, ":default");
    }
  else if (strstr(mode_string, "shared"))
    {
    strcpy(mode, ":shared");
    }
  else
    {
    strcpy(mode, "");
    }

  if (outer_plus != NULL)
    *outer_plus = '+';

  /* The copy loop below appends the mode at most once per '+'-separated
   * request, so size the buffer by the number of requests.  Counting
   * ":gpus=" occurrences could come up short, because the loop tests
   * for ":gpus" without the '='. */
  num_reqs = 1;
  for (plus = strchr(given, '+'); plus != NULL; plus = strchr(plus + 1, '+'))
    num_reqs++;

  len = strlen(given) + 1 + (num_reqs * strlen(mode));

  if ((correct_spec = calloc(1, len)) != NULL)
    {
    one_req = given;
    
    while (one_req != NULL)
      {
      if ((plus = strchr(one_req, '+')) != NULL)
        {
        *plus = '\0';
        }
      
      strcat(correct_spec, one_req);
      if (strstr(one_req, ":gpus") != NULL)
        strcat(correct_spec, mode);
      
      if (plus != NULL)
        {
        /* undo the temporary split so the caller's string stays whole */
        *plus = '+';

        strcat(correct_spec, "+");
        one_req = plus + 1;
        }
      else
        one_req = NULL;
      }
    }

  if ((LOGLEVEL >= 7) && (correct_spec != NULL) && (correct_spec[0] != '\0'))
    {
    sprintf(log_buffer, "%s: job gets adjusted gpu node spec of '%s'",
            id,
            correct_spec);

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
    }

  return(correct_spec);
  } /* get_correct_spec_string() */
コード例 #20
0
ファイル: mom_mach.c プロジェクト: A9-William/pbspro
/**
 * @brief
 *      Update the job attribute for resources used.
 *
 *      The first time this is called for a job (the resources-used attribute
 *      is not yet flagged as set), a resource entry is created for each of
 *      ncpus, cput, cpupercent, vmem, walltime and mem.  ncpus used is seeded
 *      from the ncpus requested by the job (0 when none was requested).
 *
 *      On every call the cput, vmem, walltime and mem entries are refreshed.
 *      cput, vmem and mem never decrease: the larger of the stored value and
 *      the freshly sampled value is kept.
 *
 *      Nothing is updated while the job is flagged as suspended.
 *
 *      Assumes that the session ID attribute has already been set.
 *
 * @param[in,out] pjob - job whose resources-used attribute is updated;
 *                       must not be NULL.
 *
 * @return int
 * @retval PBSE_NONE    for success.
 */
int 
mom_set_use(job *pjob)
{
	resource		*pres;
	resource		*pres_req;
	attribute		*at;
	attribute		*at_req;
	resource_def		*rd;
	u_Long 			*lp_sz, lnum_sz;
	unsigned long		*lp, lnum, oldcput;
	long			 dur;
	long                     ncpus_req;


	assert(pjob != NULL);
	at = &pjob->ji_wattr[(int)JOB_ATR_resc_used];
	assert(at->at_type == ATR_TYPE_RESC);

	if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) != 0)
		return (PBSE_NONE);	/* job suspended, don't track it */

	DBPRT(("%s: entered %s\n", __func__, pjob->ji_qs.ji_jobid))

	at->at_flags |= ATR_VFLAG_MODIFY;
	if ((at->at_flags & ATR_VFLAG_SET) == 0) {
		/* first call for this job: create every reported resource */
		at->at_flags |= ATR_VFLAG_SET;

		rd = find_resc_def(svr_resc_def, "ncpus", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_LONG;
		/*
		 * get pointer to list of resources *requested* for the job
		 * so the ncpus used can be set to ncpus requested
		 */
		at_req = &pjob->ji_wattr[(int)JOB_ATR_resource];
		assert(at_req->at_type == ATR_TYPE_RESC);

		pres_req = find_resc_entry(at_req, rd);
		if ((pres_req != NULL) &&
			((ncpus_req = pres_req->rs_value.at_val.at_long) != 0))
			pres->rs_value.at_val.at_long = ncpus_req;
		else
			pres->rs_value.at_val.at_long = 0;


		rd = find_resc_def(svr_resc_def, "cput", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_LONG;
		pres->rs_value.at_val.at_long = 0;

		rd = find_resc_def(svr_resc_def, "cpupercent", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_LONG;
		pres->rs_value.at_val.at_long = 0;

		rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_SIZE;
		pres->rs_value.at_val.at_size.atsv_shift = 10; /* in KB */
		pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;

		rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_LONG;

		rd = find_resc_def(svr_resc_def, "mem", svr_resc_size);
		assert(rd != NULL);
		pres = add_resource_entry(at, rd);
		assert(pres != NULL);
		pres->rs_value.at_flags |= ATR_VFLAG_SET;
		pres->rs_value.at_type = ATR_TYPE_SIZE;
		pres->rs_value.at_val.at_size.atsv_shift = 10; /* in KB */
		pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;
	}

	/* cput: keep the larger of the stored and the current value */
	rd = find_resc_def(svr_resc_def, "cput", svr_resc_size);
	assert(rd != NULL);
	pres = find_resc_entry(at, rd);
	assert(pres != NULL);
	lp = (unsigned long *)&pres->rs_value.at_val.at_long;
	oldcput = *lp;
	/*
	 * Sample the CPU time exactly once.  MAX() is presumably a macro that
	 * evaluates its arguments twice, which would call cput_sum() twice.
	 */
	lnum = cput_sum(pjob);
	if (lnum < *lp)
		lnum = *lp;
	*lp = lnum;

	/* now calculate weight moving average cpu usage percentage */

	if ((dur = sampletime_ceil+1 - pjob->ji_sampletim) > PBS_MIN_CPUPERCENT_PERIOD) {
		calc_cpupercent(pjob, oldcput, lnum, dur, at);
	}
	pjob->ji_sampletim = sampletime_floor;

	/* vmem: bytes rounded up to KB, never decreasing */
	rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);
	assert(rd != NULL);
	pres = find_resc_entry(at, rd);
	assert(pres != NULL);
	lp_sz = &pres->rs_value.at_val.at_size.atsv_num;
	lnum_sz = (mem_sum(pjob) + 1023) >> 10;	/* as KB */
	*lp_sz = MAX(*lp_sz, lnum_sz);

	/* walltime: elapsed time since job start, scaled by wallfactor */
	rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);
	assert(rd != NULL);
	pres = find_resc_entry(at, rd);
	assert(pres != NULL);
	pres->rs_value.at_val.at_long = (long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor);

	/* mem: bytes rounded up to KB, never decreasing */
	rd = find_resc_def(svr_resc_def, "mem", svr_resc_size);
	assert(rd != NULL);
	pres = find_resc_entry(at, rd);
	assert(pres != NULL);
	lp_sz = &pres->rs_value.at_val.at_size.atsv_num;
	lnum_sz = (resi_sum(pjob) + 1023) >> 10;	/* in KB */
	*lp_sz = MAX(*lp_sz, lnum_sz);

	return (PBSE_NONE);
}
コード例 #21
0
/*
 * assign_hosts - choose the node(s) a job will run on and record the result
 * in the job.
 *
 * The node spec comes from, in order of preference: the caller supplied
 * 'given' list, the job's "neednodes" resource, the server default node,
 * the local mom (when the server has no nodes), the first time-shared node,
 * or PBS_DEFAULT_NODE.  When the server has nodes and the spec is not
 * reported as a time-shared node by is_ts_node(), set_nodes() is called to
 * allocate the nodes.
 *
 * On success the job's destination and mom address are filled in and, when
 * requested (or when set_nodes() was called), exec_host is rewritten.
 *
 * Returns 0 on success, PBSE_UNKNODEATR, PBSE_NOTSNODE, PBSE_BADHOST, or
 * whatever non-zero code set_nodes() reported.
 */

static int assign_hosts(

  job  *pjob,           /* I (modified) */
  char *given,          /* I (optional) list of requested hosts */
  int   set_exec_host,  /* I (boolean) */
  char *FailHost,       /* O (optional,minsize=1024) */
  char *EMsg)           /* O (optional,minsize=1024) */

  {
  extern char  *mom_host;

  unsigned int  port_unused;
  char         *node_list = NULL;  /* filled in by set_nodes(), freed here */
  char         *target = NULL;     /* node spec / host selected for the job */
  char         *spec_copy = NULL;  /* heap copy of the spec (NVIDIA_GPUS only) */
  pbs_net_t     addr = 0;
  int           rc = 0;
  int           nprocs = 0;
  resource     *entry;

  /* start with empty output strings */

  if (EMsg != NULL)
    *EMsg = '\0';

  if (FailHost != NULL)
    *FailHost = '\0';

#ifdef __TREQSCHED
  /* scheduler must specify node allocation for all jobs */

  if ((given == NULL) || (given[0] == '\0'))
    {
    return(PBSE_UNKNODEATR);
    }

#endif /* __TREQSCHED */

  if ((given == NULL) || (given[0] == '\0'))
    {
    /* nothing in the run request: consult the job's resource list */

    entry = find_resc_entry(
              &pjob->ji_wattr[(int)JOB_ATR_resource],
              find_resc_def(svr_resc_def, "neednodes", svr_resc_size));

    if (entry != NULL)
      {
      target = entry->rs_value.at_val.at_str;

      if ((target == NULL) || (target[0] == '\0'))
        {
        return(PBSE_UNKNODEATR);
        }
      }

    entry = find_resc_entry(
              &pjob->ji_wattr[(int)JOB_ATR_resource],
              find_resc_def(svr_resc_def, "procs", svr_resc_size));

    if (entry != NULL)
      {
      /* remember the "procs" count; a node spec is still required */

      nprocs = entry->rs_value.at_val.at_long;

      if ((target == NULL) || (target[0] == '\0'))
        {
        return(PBSE_UNKNODEATR);
        }
      }
    }
  else
    {
#ifdef NVIDIA_GPUS
    /* work on an adjusted copy of the request; released before returning */
    spec_copy = get_correct_spec_string(given, pjob);
    target = spec_copy;
#else
    /* use the run request verbatim */
    target = given;
#endif
    }

  if (target == NULL)
    {
    /* no spec so far - fall back to a default */

    int have_default =
      ((server.sv_attr[(int)SRV_ATR_DefNode].at_flags & ATR_VFLAG_SET) &&
       (server.sv_attr[(int)SRV_ATR_DefNode].at_val.at_str != NULL));

    if (have_default)
      {
      /* server default_node */

      target = server.sv_attr[(int)SRV_ATR_DefNode].at_val.at_str;
      }
    else if (svr_totnodes == 0)
      {
      /* server has no nodes: run "local" */

      target = mom_host;
      addr = pbs_mom_addr;
      }
    else if (svr_tsnodes != 0)
      {
      /* first time-shared node */

      target = find_ts_node();

      if (target == NULL)
        {
        /* FAILURE */

        return(PBSE_NOTSNODE);
        }
      }
    else
      {
      /* one cluster node */

      target = PBS_DEFAULT_NODE;
      }
    }

  /* cluster node(s) must be allocated unless the spec is a time-shared node */

  if (svr_totnodes != 0)
    {
    rc = is_ts_node(target);

    if (rc != 0)
      {
      rc = set_nodes(pjob, target, nprocs, &node_list, FailHost, EMsg);

      set_exec_host = 1; /* maybe new VPs, must set */

      target = node_list;
      }
    }

  if (rc == 0)
    {
    if (set_exec_host == 0)
      {
      /* keep exec_host as it is and reuse the recorded mom address */

      addr = pjob->ji_qs.ji_un.ji_exect.ji_momaddr;

      target = pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str;
      }
    else
      {
      /* replace exec_host with the selected host list */

      job_attr_def[(int)JOB_ATR_exec_host].at_free(
        &pjob->ji_wattr[(int)JOB_ATR_exec_host]);

      job_attr_def[(int)JOB_ATR_exec_host].at_decode(
        &pjob->ji_wattr[(int)JOB_ATR_exec_host],
        NULL,
        NULL,
        target);  /* O */

      pjob->ji_modified = 1;
      }

    strncpy(
      pjob->ji_qs.ji_destin,
      parse_servername(target, &port_unused),
      PBS_MAXROUTEDEST);

    if (addr == 0)
      addr = get_hostaddr(pjob->ji_qs.ji_destin);

    if (addr == 0)
      {
      /* destination cannot be resolved: give the nodes back and fail */

      free_nodes(pjob);

      sprintf(log_buffer, "ALERT:  job cannot allocate node '%s' (could not determine IP address for node)",
              pjob->ji_qs.ji_destin);

      log_event(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);

      rc = PBSE_BADHOST;
      }
    else
      {
      pjob->ji_qs.ji_un.ji_exect.ji_momaddr = addr;
      }
    }  /* END if (rc == 0) */

  /* single cleanup path; free(NULL) is a no-op */

  free(node_list);

  free(spec_copy);

  return(rc);
  }  /* END assign_hosts() */
コード例 #22
0
ファイル: prolog.c プロジェクト: CESNET/torque
/*
 * pelog_export_env - add "name=value" to the environment of this process.
 *
 * The string handed to putenv() becomes part of the environment, so it is
 * heap allocated and intentionally never freed.  Best effort: nothing is
 * exported when either argument is NULL or the allocation fails.
 */

static void pelog_export_env(

  const char *name,   /* I - variable name, without the '=' */
  const char *value)  /* I - variable value */

  {
  size_t  len;
  char   *envstr;

  if ((name == NULL) || (value == NULL))
    return;

  len = strlen(name) + strlen(value) + 2;  /* '=' plus terminating NUL */

  if ((envstr = malloc(len)) == NULL)
    return;

  snprintf(envstr, len, "%s=%s", name, value);

  /* do _not_ free the string when using putenv */

  putenv(envstr);
  }  /* END pelog_export_env() */




/*
 * run_pelog - run a prologue/epilogue script for a job and wait for it.
 *
 * The script path is 'specpelog', or "<specpelog>.<jobtype>" when the job has
 * a job type and that file exists.  The script's ownership and mode are
 * checked, a child is forked, and the child sets up credentials (user
 * scripts only), stdio, arguments and environment before calling execv().
 * The parent waits for the child under an alarm of pe_alarm_time seconds.
 *
 * Returns 0 when pjob or specpelog is missing/empty, when the script does not
 * exist, or when the script exits with status 0.  Setup failures return the
 * value of pelog_err().  Otherwise run_exit is returned: the script's exit
 * status, -3 (waitpid failed), -4 (timeout; the value is set outside this
 * function, presumably by the SIGALRM handler) or -5 (timeout and the child
 * could not be reaped).
 */

int run_pelog(

  int   which,      /* I (one of PE_*) */
  char *specpelog,  /* I - script path */
  job  *pjob,       /* I - associated job */
  int   pe_io_type) /* I (one of PE_IO_TYPE_*) */

  {
  char *id = "run_pelog";

  struct sigaction act, oldact;
  char *arg[12];
  int   fds1 = 0;
  int   fds2 = 0;
  int   fd_input;
  char  resc_list[2048];
  char  resc_used[2048];

  struct stat sbuf;
  char   sid[32];        /* session id as text (room for any long) */
  char   exit_stat[16];  /* exit status as text (room for any int) */
  int    waitst = 0;
  int    isjoined;  /* boolean */
  char   buf[MAXPATHLEN + 1024];
  char   pelog[MAXPATHLEN + 1024];

  int    jobtypespecified = 0;

  resource      *r;

  char          *EmptyString = "";

  int            LastArg;
  int            aindex;

  int            rc;

  char          *ptr;

  if ((pjob == NULL) || (specpelog == NULL) || (specpelog[0] == '\0'))
    {
    return(0);
    }

  /* prefer "<script>.<jobtype>" when the job carries a job type */

  ptr = pjob->ji_wattr[(int)JOB_ATR_jobtype].at_val.at_str;

  if (ptr != NULL)
    {
    jobtypespecified = 1;

    snprintf(pelog,sizeof(pelog),"%s.%s",
      specpelog,
      ptr);
    }
  else
    {
    /* snprintf always terminates, unlike strncpy */

    snprintf(pelog,sizeof(pelog),"%s",
      specpelog);
    }

  rc = stat(pelog,&sbuf);

  if ((rc == -1) && (jobtypespecified == 1))
    {
    /* no job type specific script, fall back to the plain one */

    snprintf(pelog,sizeof(pelog),"%s",
      specpelog);

    rc = stat(pelog,&sbuf);
    }

  if (rc == -1)
    {
    if (errno == ENOENT || errno == EBADF)
      {
      /* epilog/prolog script does not exist */

      if (LOGLEVEL >= 5)
        {
        char  cwdbuf[1024];
        char *cwd = getcwd(cwdbuf, sizeof(cwdbuf));

        snprintf(log_buffer, sizeof(log_buffer),
          "%s script '%s' for job %s does not exist (cwd: %s,pid: %d)",
          PPEType[which],
          pelog,
          pjob->ji_qs.ji_jobid,
          (cwd != NULL) ? cwd : "NULL",
          (int)getpid());

        log_record(PBSEVENT_SYSTEM, 0, id, log_buffer);
        }

#ifdef ENABLE_CSA
      if ((which == PE_EPILOGUSER) && (!strcmp(pelog, path_epiloguser)))
        {
        /*
          * Add a workload management end record
        */
        if (LOGLEVEL >= 8)
          {
          snprintf(log_buffer, sizeof(log_buffer),
            "%s calling add_wkm_end from run_pelog() - no user epilog",
            pjob->ji_qs.ji_jobid);

          log_err(-1, id, log_buffer);
          }

        add_wkm_end(pjob->ji_wattr[(int)JOB_ATR_pagg_id].at_val.at_ll,

                    pjob->ji_qs.ji_un.ji_momt.ji_exitstat, pjob->ji_qs.ji_jobid);
        }

#endif /* ENABLE_CSA */

      return(0);
      }

    return(pelog_err(pjob,pelog,errno,"cannot stat"));
    }

  if (LOGLEVEL >= 5)
    {
    snprintf(log_buffer, sizeof(log_buffer), "running %s script '%s' for job %s",
      PPEType[which],
      pelog,
      pjob->ji_qs.ji_jobid);

    log_ext(-1,id,log_buffer,LOG_DEBUG);  /* not actually an error--but informational */
    }

  /* script must be owned by root (or by the job's user for the *USERJOB   *
   * variants), be a regular file, be readable and executable by its owner *
   * and not be writable by group or other */

  if (which == PE_PROLOGUSERJOB || which == PE_EPILOGUSERJOB)
    {
    if ((sbuf.st_uid != pjob->ji_qs.ji_un.ji_momt.ji_exuid) ||
        (!S_ISREG(sbuf.st_mode)) ||
        ((sbuf.st_mode & (S_IRUSR | S_IXUSR)) != (S_IRUSR | S_IXUSR)) ||
        (sbuf.st_mode & (S_IWGRP | S_IWOTH)))
      {
      return(pelog_err(pjob,pelog,-1,"permission Error"));
      }
    }
  else if ((sbuf.st_uid != 0) ||
      (!S_ISREG(sbuf.st_mode)) ||
      ((sbuf.st_mode & (S_IRUSR | S_IXUSR)) != (S_IRUSR | S_IXUSR)) ||
      (sbuf.st_mode & (S_IWGRP | S_IWOTH)))
    {
    return(pelog_err(pjob,pelog,-1,"permission Error"));
    }

  if ((which == PE_PROLOGUSER) || (which == PE_EPILOGUSER))
    {
    /* script must also be read and execute by other */

    if ((sbuf.st_mode & (S_IROTH | S_IXOTH)) != (S_IROTH | S_IXOTH))
      {
      return(pelog_err(pjob, pelog, -1, "permission Error"));
      }
    }

  fd_input = pe_input(pjob->ji_qs.ji_jobid);

  if (fd_input < 0)
    {
    return(pelog_err(pjob, pelog, -2, "no pro/epilogue input file"));
    }

  run_exit = 0;

  child = fork();

  if (child < 0)
    {
    /* fork failed: report it from this process.  Falling through to the
       child branch would exec the script in place of the caller. */

    int fork_errno = errno;

    close(fd_input);

    return(pelog_err(pjob, pelog, fork_errno, "cannot fork"));
    }

  if (child > 0)
    {
    int KillSent = FALSE;

    /* parent - watch for prolog/epilog to complete */

    close(fd_input);

    act.sa_handler = pelogalm;

    sigemptyset(&act.sa_mask);

    act.sa_flags = 0;

    sigaction(SIGALRM, &act, &oldact);

    /* it would be nice if the harvest routine could block for 5 seconds,
       and if the prolog is not complete in that time, mark job as prolog
       pending, append prolog child, and continue */

    /* main loop should attempt to harvest prolog in non-blocking mode.
       If unsuccessful after timeout, job should be terminated, and failure
       reported.  If successful, mom should unset prolog pending, and
       continue with job start sequence.  Mom should report job as running
       while prologpending flag is set.  (NOTE:  must track per job prolog
       start time)
    */

    alarm(pe_alarm_time);

    while (waitpid(child, &waitst, 0) < 0)
      {
      if (errno != EINTR)
        {
        /* exit loop. non-alarm based failure occurred */

        run_exit = -3;

        MOMPrologFailureCount++;

        break;
        }

      if (run_exit == -4)
        {
        if (KillSent == FALSE)
          {
          MOMPrologTimeoutCount++;

          /* timeout occurred */

          KillSent = TRUE;

          /* NOTE:  prolog/epilog may be locked in KERNEL space and unkillable */

          alarm(5);
          }
        else
          {
          /* cannot kill prolog/epilog, give up */

          run_exit = -5;

          break;
          }
        }
      }    /* END while (waitpid() < 0) */

    /* epilog/prolog child completed */
#ifdef ENABLE_CSA
    if ((which == PE_EPILOGUSER) && (!strcmp(pelog, path_epiloguser)))
      {
      /*
       * Add a workload management end record
      */
      if (LOGLEVEL >= 8)
        {
        snprintf(log_buffer, sizeof(log_buffer),
          "%s calling add_wkm_end from run_pelog() - after user epilog",
          pjob->ji_qs.ji_jobid);

        log_err(-1, id, log_buffer);
        }

      add_wkm_end(pjob->ji_wattr[(int)JOB_ATR_pagg_id].at_val.at_ll,

                  pjob->ji_qs.ji_un.ji_momt.ji_exitstat, pjob->ji_qs.ji_jobid);
      }

#endif /* ENABLE_CSA */

    alarm(0);

    /* restore the previous handler */

    sigaction(SIGALRM, &oldact, 0);

    if (run_exit == 0)
      {
      if (WIFEXITED(waitst))
        {
        run_exit = WEXITSTATUS(waitst);
        }
      }
    }
  else
    {
    /* child - run script */

    log_close(0);

    if (lockfds >= 0)
      {
      close(lockfds);

      lockfds = -1;
      }

    net_close(-1);

    if ((which == PE_PROLOGUSER) || (which == PE_EPILOGUSER) || (which == PE_PROLOGUSERJOB) || (which == PE_EPILOGUSERJOB))
      {
      /* user scripts run with the job's groups, gid and uid */

      if (setgroups(
          pjob->ji_grpcache->gc_ngroup,
          (gid_t *)pjob->ji_grpcache->gc_groups) != 0)
        {
        snprintf(log_buffer,sizeof(log_buffer),
          "setgroups() for UID = %lu failed: %s\n",
          (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
          strerror(errno));

        log_err(errno, id, log_buffer);

        exit(255);
        }

      if (setgid(pjob->ji_qs.ji_un.ji_momt.ji_exgid) != 0)
        {
        snprintf(log_buffer,sizeof(log_buffer),
          "setgid(%lu) for UID = %lu failed: %s\n",
          (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exgid,
          (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
          strerror(errno));

        log_err(errno, id, log_buffer);

        exit(255);
        }

      if (setuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid) != 0)
        {
        snprintf(log_buffer,sizeof(log_buffer),
          "setuid(%lu) failed: %s\n",
          (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
          strerror(errno));

        log_err(errno, id, log_buffer);

        exit(255);
        }
      }

    /* the pro/epilogue input file becomes stdin */

    if (fd_input != 0)
      {
      close(0);

      if (dup(fd_input) == -1) {}

      close(fd_input);
      }

    if (pe_io_type == PE_IO_TYPE_NULL)
      {
      /* no output, force to /dev/null */

      fds1 = open("/dev/null", O_WRONLY, 0600);
      fds2 = open("/dev/null", O_WRONLY, 0600);
      }
    else if (pe_io_type == PE_IO_TYPE_STD)
      {
      /* open job standard out/error */

      /*
       * We need to know if files are joined or not.
       * If they are then open the correct file and duplicate it to the other
      */

      isjoined = is_joined(pjob);

      switch (isjoined)
        {
        case -1:

          fds2 = open_std_file(pjob, StdErr, O_WRONLY | O_APPEND,
                               pjob->ji_qs.ji_un.ji_momt.ji_exgid);

          fds1 = dup(fds2);

          break;

        case 1:

          fds1 = open_std_file(pjob, StdOut, O_WRONLY | O_APPEND,
                               pjob->ji_qs.ji_un.ji_momt.ji_exgid);

          fds2 = dup(fds1);

          break;

        default:

          fds1 = open_std_file(pjob, StdOut, O_WRONLY | O_APPEND,
                               pjob->ji_qs.ji_un.ji_momt.ji_exgid);

          fds2 = open_std_file(pjob, StdErr, O_WRONLY | O_APPEND,
                               pjob->ji_qs.ji_un.ji_momt.ji_exgid);
          break;
        }
      }

    if (pe_io_type != PE_IO_TYPE_ASIS)
      {
      /* If PE_IO_TYPE_ASIS, leave as is, already open to job */

      if (fds1 != 1)
        {
        close(1);

        if (dup(fds1) == -1) {}

        close(fds1);
        }

      if (fds2 != 2)
        {
        close(2);

        if (dup(fds2) == -1) {}

        close(fds2);
        }
      }

    if ((which == PE_PROLOGUSER) || (which == PE_EPILOGUSER) || (which == PE_PROLOGUSERJOB) || (which == PE_EPILOGUSERJOB))
      {
      if (chdir(pjob->ji_grpcache->gc_homedir) != 0)
        {
        /* warn only, no failure */

        snprintf(log_buffer, sizeof(log_buffer),
          "PBS: chdir to %s failed: %s (running user %s in current directory)",
          pjob->ji_grpcache->gc_homedir,
          strerror(errno),
          ((which == PE_PROLOGUSER) || (which == PE_PROLOGUSERJOB)) ? "prologue" : "epilogue");

        if (write(2, log_buffer, strlen(log_buffer)) == -1) {}

        fsync(2);
        }
      }

    /* for both prolog and epilog */

    if (DEBUGMODE == 1)
      {
      /* one conversion per argument: the format must list euser as well */

      fprintf(stderr, "PELOGINFO:  script:'%s'  jobid:'%s'  euser:'%s'  egroup:'%s'  jobname:'%s' SSID:'%ld'  RESC:'%s'\n",
              pelog,
              pjob->ji_qs.ji_jobid,
              pjob->ji_wattr[(int)JOB_ATR_euser].at_val.at_str,
              pjob->ji_wattr[(int)JOB_ATR_egroup].at_val.at_str,
              pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str,
              pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long,
              resc_to_string(pjob, (int)JOB_ATR_resource, resc_list, sizeof(resc_list)));
      }

    /* arguments common to every script type */

    arg[0] = pelog;

    arg[1] = pjob->ji_qs.ji_jobid;
    arg[2] = pjob->ji_wattr[(int)JOB_ATR_euser].at_val.at_str;
    arg[3] = pjob->ji_wattr[(int)JOB_ATR_egroup].at_val.at_str;
    arg[4] = pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str;

    set_resource_vars(pjob,NULL);

    /* NOTE:  inside child */

    if ((which == PE_EPILOG) || (which == PE_EPILOGUSER) || (which == PE_EPILOGUSERJOB))
      {
      /* for epilog only */

      snprintf(sid, sizeof(sid), "%ld",
               pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long);
      snprintf(exit_stat, sizeof(exit_stat), "%d",
               pjob->ji_qs.ji_un.ji_exect.ji_exitstat);

      arg[5] = sid;
      arg[6] = resc_to_string(pjob, (int)JOB_ATR_resource, resc_list, sizeof(resc_list));
      arg[7] = resc_to_string(pjob, (int)JOB_ATR_resc_used, resc_used, sizeof(resc_used));
      arg[8] = pjob->ji_wattr[(int)JOB_ATR_in_queue].at_val.at_str;
      arg[9] = pjob->ji_wattr[(int)JOB_ATR_account].at_val.at_str;
      arg[10] = exit_stat;
      arg[11] = NULL;

      LastArg = 11;
      }
    else if (which == PE_MAGRATHEA)
      {
      char *cc = NULL, *c = NULL;

      setenv("MAGRATHEA_CLUSTER",pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str,1);

      if ((pjob->ji_wattr[(int)JOB_ATR_cloud_mapping].at_flags & ATR_VFLAG_SET) &&
          (pjob->ji_wattr[(int)JOB_ATR_cloud_mapping].at_val.at_str))
        {
        c = cloud_mom_mapping(pjob->ji_wattr[(int)JOB_ATR_cloud_mapping].at_val.at_str,mom_host,&cc);
        }

      if (c)
        arg[5]=c;
      else
        arg[5]=mom_host;

      setenv("MAGRATHEA_VIRTUAL_HOST",arg[5],1);

      if (cc)
        {
        setenv("MAGRATHEA_VIRTUAL_ALTERNATIVE",cc,1);
        free(cc);
        }

      if (pjob->ji_wattr[(int)JOB_ATR_vlan_id].at_val.at_str != NULL )
        {
        setenv("MAGRATHEA_VLANID",pjob->ji_wattr[(int)JOB_ATR_vlan_id].at_val.at_str,1);
        }

      switch (is_cloud_job(pjob))
        {
        case 1: setenv("MAGRATHEA_TYPE","create",1); break;
        case 2: setenv("MAGRATHEA_TYPE","internal",1); break;
        default: setenv("MAGRATHEA_TYPE","none",1); break;
        }


      arg[6]=(char *)0;
      LastArg = 6;
      }
    else
      {
      /* prolog */

      arg[5] = resc_to_string(pjob, (int)JOB_ATR_resource, resc_list, sizeof(resc_list));
      arg[6] = pjob->ji_wattr[(int)JOB_ATR_in_queue].at_val.at_str;
      arg[7] = pjob->ji_wattr[(int)JOB_ATR_account].at_val.at_str;
      arg[8] = NULL;

      LastArg = 8;
      }

    /* a NULL inside the list would end argv early: pass "" instead */

    for (aindex = 0;aindex < LastArg;aindex++)
      {
      if (arg[aindex] == NULL)
        arg[aindex] = EmptyString;
      }  /* END for (aindex) */

    /*
     * Pass Resource_List.nodes request in environment
     * to allow pro/epi-logue setup/teardown of system
     * settings.  --pw, 2 Jan 02
     * Fixed to use putenv for sysV compatibility.
     *  --troy, 11 jun 03
     *
     */

    r = find_resc_entry(
          &pjob->ji_wattr[(int)JOB_ATR_resource],
          find_resc_def(svr_resc_def, "nodes", svr_resc_size));

    if (r != NULL)
      pelog_export_env("PBS_RESOURCE_NODES", r->rs_value.at_val.at_str);

    r = find_resc_entry(
          &pjob->ji_wattr[(int)JOB_ATR_resource],
          find_resc_def(svr_resc_def, "gres", svr_resc_size));

    if (r != NULL)
      pelog_export_env("PBS_RESOURCE_GRES", r->rs_value.at_val.at_str);

    if (TTmpDirName(pjob, buf))
      pelog_export_env("TMPDIR", buf);

    /* Set PBS_SCHED_HINT (only when present in the job's variable list) */

    pelog_export_env("PBS_SCHED_HINT", get_job_envvar(pjob, "PBS_SCHED_HINT"));

    /* Set PBS_NODENUM */

    snprintf(buf, sizeof(buf), "%d",
      pjob->ji_nodeid);

    pelog_export_env("PBS_NODENUM", buf);

    /* Set PBS_MSHOST */

    if ((pjob->ji_vnods[0].vn_host != NULL) && (pjob->ji_vnods[0].vn_host->hn_host != NULL))
      pelog_export_env("PBS_MSHOST", pjob->ji_vnods[0].vn_host->hn_host);

    /* Set PBS_NODEFILE */

    if (pjob->ji_flags & MOM_HAS_NODEFILE)
      {
      snprintf(buf, sizeof(buf), "%s/%s",
        path_aux,
        pjob->ji_qs.ji_jobid);

      pelog_export_env("PBS_NODEFILE", buf);
      }

    /* Set umask */

    if (pjob->ji_wattr[(int)JOB_ATR_umask].at_flags & ATR_VFLAG_SET)
      {
      char umask_str[32];

      /* at_long is a long: use the 'l' length modifier */

      snprintf(umask_str, sizeof(umask_str), "%#lo",
        (unsigned long)pjob->ji_wattr[(int)JOB_ATR_umask].at_val.at_long);

      pelog_export_env("PBS_UMASK", umask_str);
      }

    /* Set PBS_O_WORKDIR */

    pelog_export_env("PBS_O_WORKDIR", get_job_envvar(pjob, "PBS_O_WORKDIR"));

    /* SET BEOWULF_JOB_MAP */

      {
      struct array_strings *vstrs;

      int j;

      vstrs = pjob->ji_wattr[(int)JOB_ATR_variables].at_val.at_arst;

      if (vstrs != NULL)
        {
        for (j = 0;j < vstrs->as_usedptr;++j)
          {
          if (strncmp(
                vstrs->as_string[j],
                "BEOWULF_JOB_MAP=",
                strlen("BEOWULF_JOB_MAP=")) == 0)
            {
            /* entry is already "NAME=value"; copy it with its NUL */

            size_t  len = strlen(vstrs->as_string[j]) + 1;
            char   *envstr = malloc(len);

            if (envstr != NULL)
              {
              memcpy(envstr, vstrs->as_string[j], len);

              putenv(envstr);
              }

            break;
            }
          }
        }
      }

    execv(pelog,arg);

    /* only reached when execv() failed */

    snprintf(log_buffer, sizeof(log_buffer), "execv of %s failed: %s\n",
      pelog,
      strerror(errno));

    if (write(2, log_buffer, strlen(log_buffer)) == -1)
      {
      /* cannot write message to stderr */

      /* NO-OP */
      }

    fsync(2);

    exit(255);
    }  /* END else () */

  switch (run_exit)
    {
    case 0:

      /* SUCCESS */

      /* NO-OP */

      break;

    case -3:

      pelog_err(pjob, pelog, run_exit, "child wait interrupted");

      break;

    case -4:

      pelog_err(pjob, pelog, run_exit, "prolog/epilog timeout occurred, child cleaned up");

      break;

    case -5:

      pelog_err(pjob, pelog, run_exit, "prolog/epilog timeout occurred, cannot kill child");

      break;

    default:

      pelog_err(pjob, pelog, run_exit, "nonzero p/e exit status");

      break;
    }  /* END switch (run_exit) */

  return(run_exit);
  }  /* END run_pelog() */
コード例 #23
0
/*
 * set_node_ct - action routine that derives other resources from "nodes".
 *
 * From the node spec held in pnodesp it sets, inside pattr:
 *   "nodect"    - ctnodes() of the spec
 *   "neednodes" - a decoded copy of the spec (any old value is freed first)
 *   "procct"    - count_proc() of the spec, plus "procs" when that is present
 *
 * Nothing is done for ATR_ACTION_RECOV.  Returns 0 on success and
 * PBSE_SYSTEM when a resource definition is missing or an entry cannot be
 * added.
 */

int set_node_ct(

  resource      *pnodesp,  /* I */
  pbs_attribute *pattr,    /* I */
  int            actmode)  /* I */

  {
  resource_def *nodect_def;
  resource     *nodect_entry;
  resource_def *neednodes_def;
  resource     *neednodes_entry;
  resource_def *procct_def;
  resource     *procct_entry = NULL;
  resource_def *procs_def;
  resource     *procs_entry;
  long          proc_total;

  if (actmode == ATR_ACTION_RECOV)
    return(0);  /* SUCCESS - nothing to derive on recovery */

  // WARNING: add_resource_entry() may re-size the underlying vector, after
  // which pnodesp can no longer be trusted. Take a private copy of the node
  // spec now and use only the copy from here on.
  std::string nodes_val(pnodesp->rs_value.at_val.at_str);

  /* --- "nodect": number of nodes in "nodes" --- */

  nodect_def = find_resc_def(svr_resc_def, "nodect", svr_resc_size);

  if (nodect_def == NULL)
    return(PBSE_SYSTEM);

  nodect_entry = find_resc_entry(pattr, nodect_def);

  if (nodect_entry == NULL)
    {
    nodect_entry = add_resource_entry(pattr, nodect_def);

    if (nodect_entry == NULL)
      return(PBSE_SYSTEM);
    }

  nodect_entry->rs_value.at_val.at_long = ctnodes(nodes_val.c_str());
  nodect_entry->rs_value.at_flags |= ATR_VFLAG_SET;

  /* --- "neednodes": starts as "nodes", may be altered by scheduler --- */

  neednodes_def = find_resc_def(svr_resc_def, "neednodes", svr_resc_size);

  if (neednodes_def == NULL)
    return(PBSE_SYSTEM);

  neednodes_entry = find_resc_entry(pattr, neednodes_def);

  if (neednodes_entry != NULL)
    {
    /* existing value is replaced below */

    neednodes_def->rs_free(&neednodes_entry->rs_value);
    }
  else
    {
    neednodes_entry = add_resource_entry(pattr, neednodes_def);

    if (neednodes_entry == NULL)
      return(PBSE_SYSTEM);
    }

  neednodes_def->rs_decode(
    &neednodes_entry->rs_value,
    NULL,
    NULL,
    nodes_val.c_str(),
    ATR_DFLAG_ACCESS);

  neednodes_entry->rs_value.at_flags |= ATR_VFLAG_SET;

  /* --- "procct": processors in "nodes" plus "procs" --- */

  procct_def = find_resc_def(svr_resc_def, "procct", svr_resc_size);

  if (procct_def == NULL)
    return(PBSE_SYSTEM);

  procct_entry = find_resc_entry(pattr, procct_def);

  if (procct_entry == NULL)
    {
    procct_entry = add_resource_entry(pattr, procct_def);

    if (procct_entry == NULL)
      return(PBSE_SYSTEM);
    }

  procs_def = find_resc_def(svr_resc_def, "procs", svr_resc_size);

  if (procs_def == NULL)
    return(PBSE_SYSTEM);

  procs_entry = find_resc_entry(pattr, procs_def);

  /* start from "procs" when it exists, then add the processors in the spec */

  proc_total = (procs_entry != NULL) ? procs_entry->rs_value.at_val.at_long : 0;
  proc_total += count_proc(nodes_val.c_str());

  procct_entry->rs_value.at_val.at_long = proc_total;
  procct_entry->rs_value.at_flags |= ATR_VFLAG_SET;

  /* SUCCESS */

  return(0);
  }  /* END set_node_ct() */
コード例 #24
0
ファイル: resc_def_all.c プロジェクト: msbritt/torque
/*
 * init_resc_defs - build the runtime resource definition table.
 *
 * Allocates svr_resc_def and fills it with:
 *   1. every entry of svr_resc_def_const except the last ("unknown") one,
 *   2. (only when PBS_MOM is not defined) one string-typed definition for
 *      each name listed in the SRV_ATR_ExtraResc server attribute that is
 *      not already present in the table,
 *   3. the "unknown" entry, so that it remains the last element.
 *
 * svr_resc_size is updated to the final number of entries.
 *
 * Returns PBSE_NONE on success, -1 if memory cannot be allocated.
 */
int init_resc_defs(void)

  {
  int                   rindex = 0;
  int                   dindex = 0;
  int                   unkindex = 0;
#ifndef PBS_MOM

  resource_def         *tmpresc = NULL;
  struct array_strings *resc_arst = NULL;
  char                 *extra_resc;
  int                   resc_num = 0;
#endif

  svr_resc_size = sizeof(svr_resc_def_const) / sizeof(resource_def);

#ifndef PBS_MOM
  /* build up a temporary list of string resources */

  /* the lookup can report success while leaving resc_arst unset, so the
   * pointer itself must be checked before it is dereferenced */
  if ((get_svr_attr_arst(SRV_ATR_ExtraResc, &resc_arst) == PBSE_NONE) &&
      (resc_arst != NULL))
    {

    /* one extra zeroed slot acts as the terminator for the copy loop below */
    tmpresc = (resource_def *)calloc(resc_arst->as_usedptr + 1, sizeof(resource_def));

    if (tmpresc == NULL)
      {
      return(-1);
      }

    for (resc_num = 0;resc_num < resc_arst->as_usedptr;resc_num++)
      {
      extra_resc = resc_arst->as_string[resc_num];

      (tmpresc + resc_num)->rs_name = strdup(extra_resc);

      if ((tmpresc + resc_num)->rs_name == NULL)
        {
        /* out of memory: release the names duplicated so far */
        while (resc_num > 0)
          {
          resc_num--;

          free((void *)(tmpresc + resc_num)->rs_name);
          }

        free(tmpresc);

        return(-1);
        }

      (tmpresc + resc_num)->rs_decode = decode_str;
      (tmpresc + resc_num)->rs_encode = encode_str;
      (tmpresc + resc_num)->rs_set = set_str;
      (tmpresc + resc_num)->rs_comp = comp_str;
      (tmpresc + resc_num)->rs_free = free_str;
      (tmpresc + resc_num)->rs_action = NULL_FUNC;
      (tmpresc + resc_num)->rs_flags = READ_WRITE;
      (tmpresc + resc_num)->rs_type = ATR_TYPE_STR;

      dindex++;

      }
    }

#endif

  /* svr_resc_size already counts the "unknown" entry; dindex is the number
   * of extra resources gathered above */
  svr_resc_def = (resource_def *)calloc(svr_resc_size + dindex, sizeof(resource_def));

  if (svr_resc_def == NULL)
     {
#ifndef PBS_MOM
     if (tmpresc != NULL)
       {
       /* the temporary entries still own their duplicated names */
       for (resc_num = 0; resc_num < dindex; resc_num++)
         {
         free((void *)(tmpresc + resc_num)->rs_name);
         }

       free(tmpresc);
       }
#endif
     return(-1);
     }

  /* copy all const resources, except for the last "unknown" */
  for (rindex = 0; rindex < (svr_resc_size - 1); rindex++)
    {
    memcpy(svr_resc_def + rindex, svr_resc_def_const + rindex, sizeof(resource_def));
    }

  unkindex = rindex;

#ifndef PBS_MOM
  /* copy our dynamic resources */
  if (tmpresc)
    {
    /* the zeroed terminator slot has a NULL rs_decode and stops the loop */
    for (dindex = 0; (tmpresc + dindex)->rs_decode; dindex++)
      {
      if (find_resc_def(svr_resc_def, (tmpresc + dindex)->rs_name, rindex) == NULL)
        {
        /* ownership of rs_name moves to the svr_resc_def entry */
        memcpy(svr_resc_def + rindex, tmpresc + dindex, sizeof(resource_def));
        rindex++;
        }
      else
        {
        /* name already defined: entry is dropped, so release its name */
        free((void *)(tmpresc + dindex)->rs_name);
        }
      }

    free(tmpresc);
    }
#endif

  /* copy the last "unknown" resource */
  memcpy(svr_resc_def + rindex, svr_resc_def_const + unkindex, sizeof(resource_def));

  svr_resc_size = rindex + 1;

  return(PBSE_NONE);
  } /* END init_resc_defs() */
コード例 #25
0
ファイル: attr_fn_resc.c プロジェクト: gto11520/torque
/*
 * decode_resc - decode one "resource = value" pair into a resource-list
 * attribute.
 *
 * Looks up the definition for rescn in svr_resc_def; when no definition
 * matches, the last table entry is used instead and PBSE_UNKRESC is
 * remembered as the eventual return code.  The matching resource entry is
 * located in (or added to) patr, write permission is verified, and the
 * definition's rs_decode routine is invoked on the entry's value.
 *
 * Returns:
 *   PBSE_INTERNAL  patr is NULL
 *   PBSE_UNKRESC   rescn is NULL, or the name was unknown and the decode
 *                  routine returned 0
 *   PBSE_SYSTEM    a new resource entry could not be added
 *   PBSE_ATTRRO    the resource is not writable with the given permissions
 *   otherwise      the decode routine's non-zero result, or 0
 */
int decode_resc(

  pbs_attribute *patr,  /* I/O - attribute holding the resource list */
  char          *name,  /* I - attribute name, passed to the decoder */
  char          *rescn, /* I - resource name, used for the lookup */
  char          *val,   /* I - resource value, passed to the decoder */
  int            perm)  /* I - access permissions */

  {
  resource_def *def;
  resource     *entry;
  int           unknown_rc = 0;
  int           decode_rc;
  int           writable;

  if (patr == NULL)
    {
    return(PBSE_INTERNAL);
    }

  if (rescn == NULL)
    {
    return(PBSE_UNKRESC);
    }

  /* an attribute that was never set gets its list head cleared first */
  if ((patr->at_flags & ATR_VFLAG_SET) == 0)
    CLEAR_HEAD(patr->at_val.at_list);

  def = find_resc_def(svr_resc_def, rescn, svr_resc_size);

  if (def == NULL)
    {
    /*
     * no definition carries this name: fall back to the last table
     * entry, but keep PBSE_UNKRESC for callers that do not want to
     * accept unknown resources
     */

    def = svr_resc_def + (svr_resc_size - 1);

    unknown_rc = PBSE_UNKRESC;
    }

  entry = find_resc_entry(patr, def);

  if (entry == NULL)
    {
    /* the list has no entry for this resource yet, create one */

    entry = add_resource_entry(patr, def);

    if (entry == NULL)
      {
      return(PBSE_SYSTEM);
      }
    }

  /* note special use of ATR_DFLAG_ACCESS, see server/attr_recov() */

  writable = ((entry->rs_defin->rs_flags & perm & ATR_DFLAG_WRACC) != 0);

  if (!writable && (perm != ATR_DFLAG_ACCESS))
    {
    return(PBSE_ATTRRO);
    }

  patr->at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODIFY;

  decode_rc = def->rs_decode(&entry->rs_value, name, rescn, val, perm);

  /*
   * a zero decode result yields the lookup status (0 or PBSE_UNKRESC);
   * any other decode result is returned unchanged
   */

  return((decode_rc == 0) ? unknown_rc : decode_rc);
  }