Example 1
void req_commit(

  struct batch_request *preq)  /* I */

  {
  job   *pj;

  pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "committing job");
    }

  if (pj == NULL)
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    log_err(errno, "req_commit", "cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  /* move job from new job list to "all" job list, set to running state */

  delete_link(&pj->ji_alljobs);

  append_link(&svr_alljobs, &pj->ji_alljobs, pj);

  /*
  ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior.
  */

  pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE;

  pj->ji_qs.ji_state = JOB_STATE_RUNNING;

  pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM;

  pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn);

  pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0;

#ifdef HAVE_GLITE_LB
  svr_logjobstate(pj, JOB_STATE_RUNNING, JOB_SUBSTATE_PRERUN, preq);
#endif

  /* For MOM - start up the job (blocks) */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "starting job execution");
    }

  /* force poll of other jobs to prevent memory crashes */
  last_poll_time = 0;

  start_exec(pj);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "job execution started");
    }

  /* if start request fails, reply with failure string */

  if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)
    {
    char tmpLine[1024];

    if ((pj->ji_hosts != NULL) &&
        (pj->ji_nodekill >= 0) &&
        (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL))
      {
      sprintf(tmpLine, "start failed on node %s",
              pj->ji_hosts[pj->ji_nodekill].hn_host);
      }
    else
      {
      sprintf(tmpLine, "start failed on unknown node");
      }

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
        tmpLine);
      }

    reply_text(preq, 0, tmpLine);
    }
  else
    {
    reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Commit);
    }

  if (is_cloud_job(pj) && pj->ji_numnodes == 1)
    {
    if (cloud_exec(pj, 1) == 0)
      cloud_set_running(pj);

    if (mom_do_poll(pj) != 0)
      append_link(&mom_polljobs, &pj->ji_jobque, pj);
    }

  job_save(pj, SAVEJOB_FULL);

  /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath,
   * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure
   * pbs_server got these attr values.  This worked fine before TORQUE modified
   * job launched into an async process.  At 2.0.0p6, a new attribute "SEND" flag
   * was added to handle this process. */

  return;
  }  /* END req_commit() */
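
The commit path above is mostly list surgery: delete_link() pulls the job off the new-jobs list and append_link() threads it onto svr_alljobs. Below is a minimal, self-contained sketch of that embedded-link list machinery (hypothetical stand-ins for TORQUE's real list_link/tlist_head), kept just detailed enough to show the move performed at the top of req_commit():

#include <stdio.h>
#include <string.h>

/* hypothetical miniature of TORQUE's list_link/tlist_head */
typedef struct list_link
  {
  struct list_link *ll_prior;
  struct list_link *ll_next;
  void             *ll_struct;  /* back-pointer to the enclosing object */
  } list_link, tlist_head;

#define CLEAR_HEAD(h) ((h).ll_prior = (h).ll_next = &(h), (h).ll_struct = NULL)
#define GET_NEXT(h)   ((h).ll_next->ll_struct)

static void append_link(tlist_head *head, list_link *lnk, void *pobj)
  {
  lnk->ll_prior  = head->ll_prior;
  lnk->ll_next   = head;
  lnk->ll_struct = pobj;
  head->ll_prior->ll_next = lnk;
  head->ll_prior = lnk;
  }

static void delete_link(list_link *lnk)
  {
  lnk->ll_prior->ll_next = lnk->ll_next;
  lnk->ll_next->ll_prior = lnk->ll_prior;
  lnk->ll_prior = lnk->ll_next = lnk;  /* self-linked = not on any list */
  }

typedef struct job
  {
  char      ji_jobid[32];
  list_link ji_alljobs;
  } job;

int main(void)
  {
  tlist_head svr_newjobs, svr_alljobs;
  job pj;

  CLEAR_HEAD(svr_newjobs);
  CLEAR_HEAD(svr_alljobs);

  strcpy(pj.ji_jobid, "42.master");
  pj.ji_alljobs.ll_prior = pj.ji_alljobs.ll_next = &pj.ji_alljobs;
  append_link(&svr_newjobs, &pj.ji_alljobs, &pj);

  /* the commit step: unlink from the new-job list, append to the all-job list */
  delete_link(&pj.ji_alljobs);
  append_link(&svr_alljobs, &pj.ji_alljobs, &pj);

  printf("newjobs head empty: %s\n", GET_NEXT(svr_newjobs) == NULL ? "yes" : "no");
  printf("first all-job: %s\n", ((job *)GET_NEXT(svr_alljobs))->ji_jobid);
  return 0;
  }
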
Example 2
/**
 * @brief
 *		que_purge - purge queue from system
 *		The queue is dequeued, the queue file is unlinked.
 *		If the queue contains any jobs, the purge is not allowed.
 *		Finally, the queue is deleted from the database
 *
 * @param[in]	pque	- The pointer to the queue to purge
 *
 * @return	error code
 * @retval	0	- queue purged or queue not valid
 * @retval	PBSE_OBJBUSY	- queue deletion not allowed
 */
int
que_purge(pbs_queue *pque)
{
	pbs_db_obj_info_t   obj;
	pbs_db_que_info_t   dbque;
	pbs_db_conn_t *conn = (pbs_db_conn_t *) svr_db_conn;

	/*
	 * If the queue (pque) is not valid, then nothing to
	 * do, just return 0.
	 */
	if (pque == NULL)
		return (0);

	/* are there any jobs still in the queue */
	if (pque->qu_numjobs != 0) {
		/*
		 * If the queue still has job(s), check if the SERVER
		 * is configured for history info and all the jobs in
		 * queue are history jobs. If yes, then allow queue
		 * deletion otherwise return PBSE_OBJBUSY.
		 */
		if (svr_history_enable) { /* SVR histconf chk */

			job 	*pjob = (job *)0;
			job 	*nxpjob = (job *)0;

			pjob = (job *)GET_NEXT(pque->qu_jobs);
			while (pjob) {
				/*
				 * If it is not a history job (MOVED/FINISHED), then
				 * return with PBSE_OBJBUSY error.
				 */
				if ((pjob->ji_qs.ji_state != JOB_STATE_MOVED) &&
					(pjob->ji_qs.ji_state != JOB_STATE_FINISHED))
					return (PBSE_OBJBUSY);
				pjob = (job *)GET_NEXT(pjob->ji_jobque);
			}
			/*
			 * All are history jobs, unlink all of them from queue.
			 * Update the number of jobs in the queue and their state
			 * count as the queue is going to be purged. No job(s)
			 * should point to the queue to be purged, make the queue
			 * header pointer of job(pjob->ji_qhdr) to NULL.
			 */
			pjob = (job *)GET_NEXT(pque->qu_jobs);
			while (pjob) {
				nxpjob = (job *)GET_NEXT(pjob->ji_jobque);
				delete_link(&pjob->ji_jobque);
				--pque->qu_numjobs;
				--pque->qu_njstate[pjob->ji_qs.ji_state];
				pjob->ji_qhdr = (pbs_queue *)0;
				pjob = nxpjob;
			}
		} else {
			return (PBSE_OBJBUSY);
		}
	}

	/* delete queue from database */
	strcpy(dbque.qu_name, pque->qu_qs.qu_name);
	obj.pbs_db_obj_type = PBS_DB_QUEUE;
	obj.pbs_db_un.pbs_db_que = &dbque;
	if (pbs_db_delete_obj(conn, &obj) != 0) {
		(void)sprintf(log_buffer,
			"delete of que %s from datastore failed",
			pque->qu_qs.qu_name);
		log_err(errno, "queue_purge", log_buffer);
	}
	que_free(pque);

	return (0);
}
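
que_purge() is careful to scan twice: the first pass only validates (any non-history job aborts the whole purge with PBSE_OBJBUSY), and only the second pass unlinks anything, so a refused purge leaves the queue untouched. A minimal sketch of the same all-or-nothing discipline, using a plain array and an assumed value for PBSE_OBJBUSY:

#include <stdio.h>

#define PBSE_OBJBUSY 15020  /* assumed value, for illustration only */

enum { JOB_STATE_RUNNING, JOB_STATE_MOVED, JOB_STATE_FINISHED };

/* purge is all-or-nothing: validate everything before touching anything */
static int purge_states(int *states, int n)
  {
  int i;

  for (i = 0; i < n; i++)          /* pass 1: validate only */
    if (states[i] != JOB_STATE_MOVED && states[i] != JOB_STATE_FINISHED)
      return PBSE_OBJBUSY;

  for (i = 0; i < n; i++)          /* pass 2: now it is safe to unlink */
    states[i] = -1;                /* stand-in for delete_link()/detach */

  return 0;
  }

int main(void)
  {
  int q1[] = { JOB_STATE_MOVED, JOB_STATE_FINISHED };
  int q2[] = { JOB_STATE_MOVED, JOB_STATE_RUNNING };

  printf("history-only queue: %d\n", purge_states(q1, 2));  /* 0 */
  printf("busy queue:         %d\n", purge_states(q2, 2));  /* PBSE_OBJBUSY */
  return 0;
  }
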
Example 3
void req_quejob(

  struct batch_request *preq) /* ptr to the decoded request   */

  {
  char  *id = "req_quejob";

  char   basename[PBS_JOBBASE + 1] = "";  /* filled from ATTR_hashname below */
  int    created_here = 0;
  int    index;
  char  *jid;
  attribute_def *pdef;
  job   *pj;
  svrattrl *psatl;
  int    rc;
  int    sock = preq->rq_conn;

  int    IsCheckpoint = 0;

  /* set basic (user) level access permission */

  resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat;

  if (PBSNodeCheckProlog)
    {
    check_state(1);

    mom_server_all_update_stat();

    if (internal_state & INUSE_DOWN)
      {
      req_reject(PBSE_MOMREJECT,0,preq,NULL,NULL);

      return;
      }
    }

  if (preq->rq_fromsvr)
    {
    /* from another server - accept the extra attributes */

    resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM;

    jid = preq->rq_ind.rq_queuejob.rq_jid;
    }
  else
    {
    /* request must be from server */

    log_err(errno, id, "request not from server");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server");

    return;
    }

  /* does job already exist, check both old and new jobs */

  if ((pj = find_job(jid)) == NULL)
    {
    pj = (job *)GET_NEXT(svr_newjobs);

    while (pj != NULL)
      {
      if (!strcmp(pj->ji_qs.ji_jobid, jid))
        break;

      pj = (job *)GET_NEXT(pj->ji_alljobs);
      }
    }

  /*
   * New job ...
   *
   * for MOM - rather than make up a hashname, we use the name sent
   * to us by the server as an attribute.
   */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (!strcmp(psatl->al_name,ATTR_hashname))
      {
      strcpy(basename,psatl->al_value);

      break;
      }

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }

  if (pj != NULL)
    {
    /* newly queued job already exists */

    if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING)
      {
      /* FAILURE - job exists and is running */

      log_err(errno,id,"cannot queue new job, job exists and is running");

      req_reject(PBSE_JOBEXIST,0,preq,NULL,"job is running");

      return;
      }

    /* if checkpointed, then keep old and skip rest of process */

    if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
      {
      IsCheckpoint = 1;
      }  /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */
    else
      {
      /* unlink job from svr_alljobs since it will be placed on newjobs */

      delete_link(&pj->ji_alljobs);
      }
    }  /* END if (pj != NULL) */
  else
    {
    /* if not already here, allocate job struct */

    if ((pj = job_alloc()) == NULL)
      {
      /* FAILURE */

      req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot allocate new job structure");

      return;
      }
    }    /* END else (pj != NULL) */

  if (IsCheckpoint == 0)
    {
    strcpy(pj->ji_qs.ji_jobid,jid);

    strcpy(pj->ji_qs.ji_fileprefix,basename);

    pj->ji_modified       = 1;

    pj->ji_qs.ji_svrflags = created_here;

    pj->ji_qs.ji_un_type  = JOB_UNION_TYPE_NEW;
    }

  /* decode attributes from request into job structure */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (IsCheckpoint == 1)
      {
      if (strcmp(psatl->al_name,ATTR_checkpoint_name) &&
          strcmp(psatl->al_name,ATTR_v))
        {
        psatl = (svrattrl *)GET_NEXT(psatl->al_link);

        continue;
        }
      }

    /* identify the attribute by name */

    index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST);

    if (index < 0)
      {
      /* FAILURE */

#if 0
      /* old implementation, refuse jobs with unknown attributes */
      /* didn't recognize the name */

      job_purge(pj);   /* CRI - 12/20/2004 */

      reply_badattr(PBSE_NOATTR,1,psatl,preq);

      return;
#endif

      /* new implementation, ignore unknown attributes */
      sprintf(log_buffer, "Unknown attribute \"%s\" received in req_quejob request", psatl->al_name);
      LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, log_buffer);
      psatl = (svrattrl *)GET_NEXT(psatl->al_link);
      continue;
      }

    pdef = &job_attr_def[index];

    /* Is attribute not writeable by manager or by a server? */

    if ((pdef->at_flags & resc_access_perm) == 0)
      {
      /* FAILURE */

      job_purge(pj);

      reply_badattr(PBSE_ATTRRO,1,psatl,preq);

      return;
      }

    /* decode attribute */

    if (!strcmp(psatl->al_name,ATTR_v))
      {
      rc = decode_arst_merge(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value);
      }
    else
      {
      rc = pdef->at_decode(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value);
      }

    if (rc != 0)
      {
      /* FAILURE */

      /* all errors are fatal for MOM */
      if (rc != PBSE_UNKRESC)
        {
        job_purge(pj);

        reply_badattr(rc,1,psatl,preq);

        return;
        }
      psatl = (svrattrl *)GET_NEXT(psatl->al_link);
      continue;
      }

    if (psatl->al_op == DFLT)
      {
      if (psatl->al_resc)
        {
#if 0   /* don't mark resources as default on nodes, we need all resources stored in the jobfile */
        resource     *presc;
        resource_def *prdef;

        prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size);

        if (prdef == NULL)
          {
          job_purge(pj);

          reply_badattr(rc,1,psatl, preq);

          return;
          }

        presc = find_resc_entry(&pj->ji_wattr[index],prdef);

        if (presc != NULL)
          presc->rs_value.at_flags |= ATR_VFLAG_DEFLT;
#endif
        }
      else
        {
        pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT;
        }
      }    /* END if (psatl->al_op == DFLT) */

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }      /* END while (psatl != NULL) */

  if (IsCheckpoint == 1)
    {
    pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

    if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0)
      {
      delete_link(&pj->ji_alljobs);

      append_link(&svr_newjobs,&pj->ji_alljobs,pj);

      pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;
      pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;
      pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock);
      pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

      /* Per Eric R., req_mvjobfile was giving error in open_std_file,
         showed up as fishy error message */

      if (pj->ji_grpcache != NULL)
        {
        free(pj->ji_grpcache);
        pj->ji_grpcache = NULL;
        }
      }
    else
      {
      close_conn(sock);
      }

    /* SUCCESS */

    return;
    }

  /* set remaining job structure elements */

  pj->ji_qs.ji_state =    JOB_STATE_TRANSIT;

  pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

  pj->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long = (long)time_now;

  pj->ji_wattr[(int)JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET;

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;

  pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;

  pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock);

  pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

  /* acknowledge the request with the job id */

  if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0)
    {
    /* reply failed, purge the job and close the connection */

    close_conn(sock);

    job_purge(pj);

    return;
    }

  /* link job into server's new jobs list */

  append_link(&svr_newjobs, &pj->ji_alljobs, pj);

  return;
  }  /* END req_quejob() */
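
req_quejob() resolves each incoming attribute by name through find_attr() and then dispatches to the at_decode function pointer stored in the attribute_def table. The sketch below reproduces that table-driven dispatch in miniature; the attribute names, decoders, and table contents are invented for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct attribute_def
  {
  const char *at_name;
  int       (*at_decode)(long *dest, const char *value);
  } attribute_def;

static int decode_long(long *dest, const char *value)
  {
  char *end;

  *dest = strtol(value, &end, 10);
  return (*end == '\0') ? 0 : -1;   /* nonzero rc = decode failure */
  }

static int decode_bool(long *dest, const char *value)
  {
  if (!strcmp(value, "true"))  { *dest = 1; return 0; }
  if (!strcmp(value, "false")) { *dest = 0; return 0; }
  return -1;
  }

/* made-up attribute table, standing in for the real job_attr_def */
static attribute_def job_attr_def[] =
  {
    { "ctime",       decode_long },
    { "interactive", decode_bool },
  };

#define JOB_ATR_LAST ((int)(sizeof(job_attr_def) / sizeof(job_attr_def[0])))

/* linear name lookup, as find_attr() does over the real table */
static int find_attr(const char *name)
  {
  int i;

  for (i = 0; i < JOB_ATR_LAST; i++)
    if (!strcmp(job_attr_def[i].at_name, name))
      return i;

  return -1;  /* unknown attributes are skipped, not fatal */
  }

int main(void)
  {
  long wattr[JOB_ATR_LAST] = { 0 };
  int  index = find_attr("ctime");

  if (index >= 0 && job_attr_def[index].at_decode(&wattr[index], "1700000000") == 0)
    printf("ctime decoded: %ld\n", wattr[index]);

  return 0;
  }
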
Example 4
int acct_job(

  job            *pjob, /* I */
  dynamic_string *ds)   /* O */

  {
  int         rc;
  long        cray_enabled = FALSE;
  int         resc_access_perm = READ_ONLY;
  char        local_buf[MAXLINE*4];
  pbs_queue  *pque;

  tlist_head  attrlist;
  svrattrl   *pal;

  if (pjob == NULL)
    {
    return(PBSE_NONE);
    }

  CLEAR_HEAD(attrlist);

  /* user */

  /* acct_job is only called from account_jobstr and account_jobend. BufSize
     should be PBS_ACCT_MAX_RCD + 1 in size. */
  sprintf(local_buf, "user=%s ",
    pjob->ji_wattr[JOB_ATR_euser].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* group */
  sprintf(local_buf, "group=%s ",
    pjob->ji_wattr[JOB_ATR_egroup].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* account */
  if (pjob->ji_wattr[JOB_ATR_account].at_flags & ATR_VFLAG_SET)
    {
    sprintf(local_buf, "account=%s ",
      pjob->ji_wattr[JOB_ATR_account].at_val.at_str);
    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }

  /* job name */
  sprintf(local_buf, "jobname=%s ",
    pjob->ji_wattr[JOB_ATR_jobname].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  if ((pque = get_jobs_queue(&pjob)) != NULL)
    {
    /* queue name */
    sprintf(local_buf, "queue=%s ", pque->qu_qs.qu_name);
    unlock_queue(pque, __func__, NULL, LOGLEVEL);

    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }
  else if (pjob == NULL)
    {
    log_err(PBSE_JOBNOTFOUND, __func__, "Job lost while acquiring queue 1");
    return(PBSE_JOBNOTFOUND);
    }

  /* create time */
  sprintf(local_buf, "ctime=%ld ",
    pjob->ji_wattr[JOB_ATR_ctime].at_val.at_long);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* queued time */
  sprintf(local_buf, "qtime=%ld ",
    pjob->ji_wattr[JOB_ATR_qtime].at_val.at_long);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* eligible time, how long ready to run */
  sprintf(local_buf, "etime=%ld ",
    pjob->ji_wattr[JOB_ATR_etime].at_val.at_long);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* execution start time */
  sprintf(local_buf, "start=%ld ",
    (long)pjob->ji_qs.ji_stime);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);

  /* owner */
  sprintf(local_buf, "owner=%s ",
    pjob->ji_wattr[JOB_ATR_job_owner].at_val.at_str);
  if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
    return(rc);
 
  /* For large clusters strings can get pretty long. We need to see if there
     is a need to allocate a bigger buffer */
  /* execution host name */
  append_dynamic_string(ds, "exec_host=");
  append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
  if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
    return(rc);

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);
  if ((cray_enabled == TRUE) &&
      (pjob->ji_wattr[JOB_ATR_login_node_id].at_flags & ATR_VFLAG_SET))
    {
    append_dynamic_string(ds, "login_node=");
    append_dynamic_string(ds, pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str);
    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);
    }

  /* now encode the job's resource_list pbs_attribute */
  job_attr_def[JOB_ATR_resource].at_encode(
    &pjob->ji_wattr[JOB_ATR_resource],
    &attrlist,
    job_attr_def[JOB_ATR_resource].at_name,
    NULL,
    ATR_ENCODE_CLIENT,
    resc_access_perm);

  while ((pal = GET_NEXT(attrlist)) != NULL)
    {
    /* exec_host can use a lot of buffer space. Use a dynamic string */
    append_dynamic_string(ds, pal->al_name);

    if (pal->al_resc != NULL)
      {
      append_dynamic_string(ds, ".");
      append_dynamic_string(ds, pal->al_resc);
      }

    append_dynamic_string(ds, "=");
    append_dynamic_string(ds, pal->al_value);
    if ((rc = append_dynamic_string(ds, " ")) != PBSE_NONE)
      return(rc);

    delete_link(&pal->al_link);
    free(pal);
    }  /* END while (pal != NULL) */

#ifdef ATTR_X_ACCT

  /* x attributes */
  if (pjob->ji_wattr[JOB_SITE_ATR_x].at_flags & ATR_VFLAG_SET)
    {
    sprintf(local_buf, "x=%s ",
            pjob->ji_wattr[JOB_SITE_ATR_x].at_val.at_str);
    if ((rc = append_dynamic_string(ds, local_buf)) != PBSE_NONE)
      return(rc);
    }

#endif

  /* SUCCESS */

  return(PBSE_NONE);
  }  /* END acct_job() */
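
Every append in acct_job() goes through append_dynamic_string() and bails out on a non-PBSE_NONE return, so a failed allocation aborts record construction immediately instead of truncating it silently. Here is a minimal grow-on-demand string with the same calling convention; the struct layout and error value are guesses for illustration, not TORQUE's actual dynamic_string:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PBSE_NONE   0
#define PBSE_SYSTEM 15010  /* assumed value, for illustration only */

typedef struct dynamic_string
  {
  char   *str;
  size_t  used;  /* bytes in use, excluding the NUL */
  size_t  size;  /* bytes allocated */
  } dynamic_string;

static int append_dynamic_string(dynamic_string *ds, const char *txt)
  {
  size_t need = ds->used + strlen(txt) + 1;

  if (need > ds->size)
    {
    size_t newsize = (ds->size == 0) ? 64 : ds->size;
    char  *tmp;

    while (newsize < need)
      newsize *= 2;

    if ((tmp = realloc(ds->str, newsize)) == NULL)
      return PBSE_SYSTEM;          /* caller must check and bail out */

    ds->str  = tmp;
    ds->size = newsize;
    }

  memcpy(ds->str + ds->used, txt, strlen(txt) + 1);
  ds->used += strlen(txt);
  return PBSE_NONE;
  }

int main(void)
  {
  dynamic_string ds = { NULL, 0, 0 };

  if (append_dynamic_string(&ds, "user=alice ") != PBSE_NONE ||
      append_dynamic_string(&ds, "queue=batch ") != PBSE_NONE)
    return 1;

  printf("%s\n", ds.str);  /* user=alice queue=batch */
  free(ds.str);
  return 0;
  }
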
Example 5
void req_commit(

  struct batch_request *preq)  /* I */

  {
  unsigned int  momport = 0;
  int           rc;
  job          *pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "committing job");
    }

  if (pj == NULL)
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    log_err(errno, "req_commit", (char *)"cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  /* move job from new job list to "all" job list, set to running state */

  delete_link(&pj->ji_alljobs);

  append_link(&svr_alljobs, &pj->ji_alljobs, pj);

  /*
  ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior.
  */

  pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE;

  pj->ji_qs.ji_state = JOB_STATE_RUNNING;

  pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM;

  pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn,FALSE);

  pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0;

  /* For MOM - start up the job (blocks) */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "req_commit:starting job execution");
    }

  rc = start_exec(pj);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pj->ji_qs.ji_jobid,
      "req_commit:job execution started");
    }

  /* if start request fails, reply with failure string */

  if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)
    {
    char tmpLine[1024];

    if ((pj->ji_hosts != NULL) &&
        (pj->ji_nodekill >= 0) &&
        (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL))
      {
      sprintf(tmpLine, "start failed on node %s",
              pj->ji_hosts[pj->ji_nodekill].hn_host);
      }
    else
      {
      sprintf(tmpLine, "start failed on unknown node");
      }

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
        tmpLine);
      }

    reply_text(preq, rc, tmpLine);
    }
  else
    {
    reply_sid(preq, pj->ji_wattr[JOB_ATR_session_id].at_val.at_long,BATCH_REPLY_CHOICE_Text);
    }

  if (multi_mom)
    {
    momport = pbs_rm_port;
    }

  job_save(pj, SAVEJOB_FULL, momport);

#ifdef NVIDIA_GPUS
  /*
   * Does this job have a gpuid assigned?
   * if so, then update gpu status
   */
  if ((use_nvidia_gpu) && 
      ((pj->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pj->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }
#endif  /* NVIDIA_GPUS */


  /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath,
   * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure
   * pbs_server got these attr values.  This worked fine before TORQUE modified
   * job launched into an async process.  At 2.0.0p6, a new pbs_attribute "SEND" flag
   * was added to handle this process. */

  return;
  }  /* END req_commit() */
Example 6
void free_br(

  struct batch_request *preq)

  {
  delete_link(&preq->rq_link);

  reply_free(&preq->rq_reply);

  if (preq->rq_extend)
    free(preq->rq_extend);

  switch (preq->rq_type)
    {
    case PBS_BATCH_QueueJob:

      free_attrlist(&preq->rq_ind.rq_queuejob.rq_attr);

      break;

    case PBS_BATCH_JobCred:

      if (preq->rq_ind.rq_jobcred.rq_data)
        free(preq->rq_ind.rq_jobcred.rq_data);

      break;

    case PBS_BATCH_MvJobFile:

    case PBS_BATCH_jobscript:

      if (preq->rq_ind.rq_jobfile.rq_data)
        free(preq->rq_ind.rq_jobfile.rq_data);

      break;

    case PBS_BATCH_HoldJob:

      freebr_manage(&preq->rq_ind.rq_hold.rq_orig);

      break;

    case PBS_BATCH_CheckpointJob:

      freebr_manage(&preq->rq_ind.rq_manager);

      break;

    case PBS_BATCH_MessJob:

      if (preq->rq_ind.rq_message.rq_text)
        free(preq->rq_ind.rq_message.rq_text);

      break;

    case PBS_BATCH_ModifyJob:

    case PBS_BATCH_AsyModifyJob:

      freebr_manage(&preq->rq_ind.rq_modify);

      break;

    case PBS_BATCH_StatusJob:

    case PBS_BATCH_StatusQue:

    case PBS_BATCH_StatusNode:

    case PBS_BATCH_StatusSvr:
      /* DIAGTODO: handle PBS_BATCH_StatusDiag */

      free_attrlist(&preq->rq_ind.rq_status.rq_attr);

      break;

    case PBS_BATCH_JobObit:

      free_attrlist(&preq->rq_ind.rq_jobobit.rq_attr);

      break;

    case PBS_BATCH_CopyFiles:

    case PBS_BATCH_DelFiles:

      freebr_cpyfile(&preq->rq_ind.rq_cpyfile);

      break;

    default:

      /* NO-OP */

      break;
    }  /* END switch (preq->rq_type) */

  free(preq);

  return;
  }  /* END free_br() */
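
free_br() has to release whatever the rq_ind union holds for the given rq_type before freeing the envelope, and the switch is the one place that knows how each variant is torn down. A reduced sketch of that tagged-union cleanup pattern, with invented request types and fields:

#include <stdlib.h>
#include <string.h>

enum req_type { REQ_QUEUEJOB, REQ_MESSAGE };

/* reduced stand-in for batch_request (invented fields) */
struct request
  {
  enum req_type rq_type;
  union
    {
    char *rq_script;  /* owned by REQ_QUEUEJOB */
    char *rq_text;    /* owned by REQ_MESSAGE */
    } rq_ind;
  };

static void free_request(struct request *preq)
  {
  /* free the variant's heap data first, then the envelope */
  switch (preq->rq_type)
    {
    case REQ_QUEUEJOB:

      free(preq->rq_ind.rq_script);

      break;

    case REQ_MESSAGE:

      free(preq->rq_ind.rq_text);

      break;

    default:

      /* NO-OP: variants with no owned data */

      break;
    }

  free(preq);
  }

int main(void)
  {
  struct request *preq = calloc(1, sizeof(*preq));

  if (preq == NULL)
    return 1;

  preq->rq_type = REQ_MESSAGE;
  preq->rq_ind.rq_text = strdup("hello mom");
  free_request(preq);
  return 0;
  }
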
Example 7
int save_attr(

  struct attribute_def *padef,           /* pbs_attribute definition array */
  pbs_attribute        *pattr,           /* ptr to pbs_attribute value array */
  int                   numattr,         /* number of attributes in array */
  int                   fds,
  char                 *buf_ptr,         /* M */
  size_t               *space_remaining, /* O */
  size_t                buf_size)        /* I */

  {
  svrattrl    dummy;
  int         errct = 0;
  tlist_head  lhead;
  int         i;
  int         resc_access_perm = ATR_DFLAG_ACCESS;
  svrattrl   *pal;
  int         rc;

  /* encode each pbs_attribute which has a value (not non-set) */

  CLEAR_HEAD(lhead);

  for (i = 0;i < numattr;i++)
    {
    if ((padef + i)->at_type != ATR_TYPE_ACL)
      {
      /* NOTE: access lists are not saved this way */

      rc = (padef + i)->at_encode(
          pattr + i,
          &lhead,
          (padef + i)->at_name,
          NULL,
          ATR_ENCODE_SAVE,
          resc_access_perm);

      if (rc < 0)
        errct++;

      (pattr + i)->at_flags &= ~ATR_VFLAG_MODIFY;

      /* now that it has been encoded, block and save it */

      while ((pal = (svrattrl *)GET_NEXT(lhead)) != NULL)
        {
        if (save_struct((char *)pal, pal->al_tsize, fds, buf_ptr, space_remaining, buf_size) < 0)
          errct++;

        delete_link(&pal->al_link);

        free(pal);
        }
      }
    }  /* END for (i) */

  /* indicate last of attributes by writing dummy entry */

  memset(&dummy, 0, sizeof(dummy));

  dummy.al_tsize = ENDATTRIBUTES;

  if (save_struct((char *)&dummy, sizeof(dummy), fds, buf_ptr, space_remaining, buf_size) < 0)
    errct++;

  if (errct != 0)
    {
    return(-1);
    }

  /* SUCCESS */

  return(0);
  }  /* END save_attr() */
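
save_attr() never writes a record count; it streams the encoded attributes and then terminates with a dummy entry whose al_tsize is the ENDATTRIBUTES sentinel, letting the reader stop without knowing the count up front. A compact file-based sketch of the convention (the record layout and sentinel value here are assumptions for illustration):

#include <stdio.h>
#include <string.h>

#define ENDATTRIBUTES ((int)-711)  /* assumed sentinel value, for illustration */

struct record
  {
  int  tsize;      /* total size; ENDATTRIBUTES marks the end of the stream */
  char name[16];
  };

int main(void)
  {
  FILE *fp = tmpfile();
  struct record r, end;
  const char *names[] = { "ctime", "owner" };
  size_t i;

  if (fp == NULL)
    return 1;

  /* write the records, then the sentinel */
  for (i = 0; i < 2; i++)
    {
    memset(&r, 0, sizeof(r));
    r.tsize = (int)sizeof(r);
    strcpy(r.name, names[i]);
    fwrite(&r, sizeof(r), 1, fp);
    }

  memset(&end, 0, sizeof(end));
  end.tsize = ENDATTRIBUTES;
  fwrite(&end, sizeof(end), 1, fp);

  /* read back until the sentinel appears */
  rewind(fp);

  while (fread(&r, sizeof(r), 1, fp) == 1 && r.tsize != ENDATTRIBUTES)
    printf("recovered attribute: %s\n", r.name);

  fclose(fp);
  return 0;
  }
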
Example 8
/* delete a job array struct from memory and disk. This is used when the number
 * of jobs that belong to the array becomes zero.
 * Returns zero if there are no errors, non-zero otherwise.
 */
int array_delete(
    
  job_array *pa)

  {
  int                      i;
  char                     path[MAXPATHLEN + 1];
  char                     log_buf[LOCAL_LOG_BUF_SIZE];
  array_request_node      *rn;
  struct array_depend     *pdep;
  struct array_depend_job *pdj;

  /* first thing to do is take this out of the servers list of all arrays */
  remove_array(pa);

  /* unlock the mutex and free it */
  unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
  free(pa->ai_mutex);

  /* delete the on disk copy of the struct */
  snprintf(path, sizeof(path), "%s%s%s",
    path_arrays, pa->ai_qs.fileprefix, ARRAY_FILE_SUFFIX);

  if (unlink(path))
    {
    sprintf(log_buf, "unable to delete %s", path);
    log_err(errno, "array_delete", log_buf);
    }

  /* clear array request linked list */
  for (rn = (array_request_node *)GET_NEXT(pa->request_tokens);
       rn != NULL;
       rn = (array_request_node *)GET_NEXT(pa->request_tokens))
    {
    delete_link(&rn->request_tokens_link);
    free(rn);
    }

  /* free the memory for the job pointers */
  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] != NULL)
      free(pa->job_ids[i]);
    }

  free(pa->job_ids);

  /* free the dependencies, if any */
  for (pdep = (struct array_depend *)GET_NEXT(pa->ai_qs.deps); 
       pdep != NULL;
       pdep = (struct array_depend *)GET_NEXT(pa->ai_qs.deps))
    {
    delete_link(&pdep->dp_link);

    for (pdj = (struct array_depend_job *)GET_NEXT(pdep->dp_jobs);
         pdj != NULL;
         pdj = (struct array_depend_job *)GET_NEXT(pdep->dp_jobs))
      {
      delete_link(&pdj->dc_link);
      free(pdj);
      }

    free(pdep);
    }

  /* purge the "template" job, 
     this also deletes the shared script file for the array*/
  if (pa->ai_qs.parent_id[0] != '\0')
    {
    job *pjob;
    if ((pjob = svr_find_job(pa->ai_qs.parent_id, FALSE)) != NULL)
      svr_job_purge(pjob);
    }

  /* free the memory allocated for the struct */
  free(pa);

  return(PBSE_NONE);
  } /* END array_delete() */
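
Note that both cleanup loops above re-read the list head on every iteration (rn = GET_NEXT(pa->request_tokens) in the update clause) rather than advancing from the current node: each pass deletes the node it just fetched, so the head always yields the next survivor and no freed next pointer is ever followed. The same drain idiom on a plain singly linked list:

#include <stdio.h>
#include <stdlib.h>

struct node
  {
  int          value;
  struct node *next;
  };

int main(void)
  {
  struct node *head = NULL, *n;
  int i;

  for (i = 0; i < 3; i++)
    {
    if ((n = malloc(sizeof(*n))) == NULL)
      return 1;

    n->value = i;
    n->next  = head;
    head = n;
    }

  /* drain idiom: always take the current head, never a saved "next",
     because the node we hold is freed inside the loop body */
  while ((n = head) != NULL)
    {
    head = n->next;   /* equivalent of re-reading GET_NEXT(head) */
    printf("freeing node %d\n", n->value);
    free(n);
    }

  return 0;
  }
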
Example 9
/* array_recov reads in an array struct saved to disk and inserts it into
   the server's list of arrays */
int array_recov(

  char       *path, 
  job_array **new_pa)

  {
  job_array *pa;
  array_request_node *rn;
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  int   fd;
  int   old_version;
  int   num_tokens;
  int   i;
  int   len;
  int   rc;

  *new_pa = NULL;

  old_version = ARRAY_QS_STRUCT_VERSION;

  /* allocate the storage for the struct */
  pa = (job_array*)calloc(1,sizeof(job_array));

  if (pa == NULL)
    {
    return(PBSE_SYSTEM);
    }

  /* initialize the linked list nodes */

  CLEAR_HEAD(pa->request_tokens);

  fd = open(path, O_RDONLY, 0);
  if (fd < 0)
    {
    free(pa);
    return(PBSE_SYSTEM);
    }

  if (array_259_upgrade)
    {
    rc = read_and_convert_259_array(fd, pa, path);
    if (rc != PBSE_NONE)
      {
      free(pa);
      close(fd);
      return(rc);
      }
    }
  else
    {

    /* read the file into the struct previously allocated */

    len = read_ac_socket(fd, &(pa->ai_qs), sizeof(pa->ai_qs));
    if ((len < 0) || ((len < (int)sizeof(pa->ai_qs)) && (pa->ai_qs.struct_version == ARRAY_QS_STRUCT_VERSION)))
      {
      sprintf(log_buf, "error reading %s", path);
      log_err(errno, __func__, log_buf);
      free(pa);
      close(fd);
      return(PBSE_SYSTEM);
      }

    if (pa->ai_qs.struct_version != ARRAY_QS_STRUCT_VERSION)
      {
      rc = array_upgrade(pa, fd, pa->ai_qs.struct_version, &old_version);
      if (rc)
        {
        sprintf(log_buf, "Cannot upgrade array version %d to %d", pa->ai_qs.struct_version, ARRAY_QS_STRUCT_VERSION);
        log_err(errno, __func__, log_buf);
        free(pa);
        close(fd);
        return(rc);
        }
      }
    }

  pa->job_ids = (char **)calloc(pa->ai_qs.array_size, sizeof(char *));

  if (pa->job_ids == NULL)
    {
    log_err(ENOMEM, __func__, "cannot allocate job id array");
    free(pa);
    close(fd);
    return(PBSE_SYSTEM);
    }

  /* check to see if there is any additional info saved in the array file */
  /* check if there are any array request tokens that haven't been fully
     processed */

  if (old_version > 1)
    {
    if (read_ac_socket(fd, &num_tokens, sizeof(int)) != sizeof(int))
      {
      sprintf(log_buf, "error reading token count from %s", path);
      log_err(errno, __func__, log_buf);

      free(pa);
      close(fd);
      return(PBSE_SYSTEM);
      }

    for (i = 0; i < num_tokens; i++)
      {
      rn = (array_request_node *)calloc(1, sizeof(array_request_node));

      if (read_ac_socket(fd, rn, sizeof(array_request_node)) != sizeof(array_request_node))
        {
        sprintf(log_buf, "error reading array_request_node from %s", path);
        log_err(errno, __func__, log_buf);

        free(rn);

        for (rn = (array_request_node*)GET_NEXT(pa->request_tokens);
            rn != NULL;
            rn = (array_request_node*)GET_NEXT(pa->request_tokens))
          {
          delete_link(&rn->request_tokens_link);
          free(rn);
          }

        free(pa);

        close(fd);
        return(PBSE_SYSTEM);
        }

      CLEAR_LINK(rn->request_tokens_link);

      append_link(&pa->request_tokens, &rn->request_tokens_link, (void*)rn);

      }

    }

  close(fd);

  CLEAR_HEAD(pa->ai_qs.deps);

  if (old_version != ARRAY_QS_STRUCT_VERSION)
    {
    /* resave the array struct if the version on disk is older than the current */
    array_save(pa);
    }

  pa->ai_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t));
  pthread_mutex_init(pa->ai_mutex,NULL);

  lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);

  /* link the struct into the servers list of job arrays */
  insert_array(pa);

  *new_pa = pa;

  return(PBSE_NONE);
  } /* END array_recov() */
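
array_recov() reads the fixed quick-save area first, checks its struct_version, and only routes through array_upgrade() when the on-disk copy is older, re-saving afterwards so the next recovery takes the fast path. A toy version of that check-then-upgrade load; the struct layouts and version numbers are invented:

#include <stdio.h>
#include <string.h>

#define QS_VERSION 2

/* toy layouts; the real ai_qs area has many more fields */
struct qs_v1 { int struct_version; int array_size; };
struct qs_v2 { int struct_version; int array_size; int slot_limit; };

/* read the current layout; fall back to converting the old one in place */
static int load_qs(FILE *fp, struct qs_v2 *out)
  {
  struct qs_v1 old;

  rewind(fp);

  if (fread(out, sizeof(*out), 1, fp) == 1 &&
      out->struct_version == QS_VERSION)
    return 0;                       /* fast path: current version */

  rewind(fp);

  if (fread(&old, sizeof(old), 1, fp) != 1 || old.struct_version != 1)
    return -1;                      /* unknown or corrupt version */

  /* upgrade: copy the old fields, default the new ones */
  out->struct_version = QS_VERSION;
  out->array_size     = old.array_size;
  out->slot_limit     = -1;         /* "no limit" default for upgraded files */
  return 0;
  }

int main(void)
  {
  FILE *fp = tmpfile();
  struct qs_v1 old = { 1, 10 };
  struct qs_v2 cur;

  if (fp == NULL)
    return 1;

  fwrite(&old, sizeof(old), 1, fp);

  if (load_qs(fp, &cur) == 0)
    printf("version %d, %d jobs, slot_limit %d\n",
           cur.struct_version, cur.array_size, cur.slot_limit);

  fclose(fp);
  return 0;
  }
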
Example 10
void mom_job_purge(

  job *pjob)  /* I (modified) */

  {
  job_file_delete_info *jfdi;

  jfdi = (job_file_delete_info *)calloc(1, sizeof(job_file_delete_info));

  if (jfdi == NULL)
    {
    log_err(ENOMEM,__func__, (char *)"No space to allocate info for job file deletion");
    return;
    }

#ifdef NVIDIA_GPUS
  /*
   * Did this job have a gpuid assigned?
   * if so, then update gpu status
   */
  if (((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }
#endif  /* NVIDIA_GPUS */

  /* initialize struct information */
  if (pjob->ji_flags & MOM_HAS_TMPDIR)
    {
    jfdi->has_temp_dir = TRUE;
    pjob->ji_flags &= ~MOM_HAS_TMPDIR;
    }
  else
    jfdi->has_temp_dir = FALSE;

  strcpy(jfdi->jobid,pjob->ji_qs.ji_jobid);
  strcpy(jfdi->prefix,pjob->ji_qs.ji_fileprefix);

  if ((pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET))
    jfdi->checkpoint_dir = strdup(pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_val.at_str);

  jfdi->gid = pjob->ji_qs.ji_un.ji_momt.ji_exgid;
  jfdi->uid = pjob->ji_qs.ji_un.ji_momt.ji_exuid;

  if (thread_unlink_calls == TRUE)
    enqueue_threadpool_request(delete_job_files,jfdi);
  else
    delete_job_files(jfdi);

  /* remove this job from the global queue */
  delete_link(&pjob->ji_jobque);
  delete_link(&pjob->ji_alljobs);

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer,"removing job");

    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
    }

#if IBM_SP2==2        /* IBM SP PSSP 3.1 */
  unload_sp_switch(pjob);

#endif   /* IBM SP */

  mom_job_free(pjob);

  /* if no jobs are left, check if MOM should be restarted */

  if (((job *)GET_NEXT(svr_alljobs)) == NULL)
    MOMCheckRestart();

  return;
  }  /* END mom_job_purge() */
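
mom_job_purge() copies everything the deletion pass will need (job id, file prefix, uid/gid, checkpoint directory) into a heap-allocated job_file_delete_info before the job itself is freed, which is what lets delete_job_files() run later on the thread pool without touching freed memory. A stripped-down sketch of that snapshot-then-dispose pattern, with stand-in struct names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct job_info
  {
  char jobid[32];
  char workdir[64];
  };

/* everything the deferred cleanup needs, owned by the cleanup itself */
struct cleanup_info
  {
  char jobid[32];
  char workdir[64];
  };

static void delete_job_files(struct cleanup_info *ci)
  {
  /* runs after the job struct is gone; only touches the copied data */
  printf("removing %s for job %s\n", ci->workdir, ci->jobid);
  free(ci);
  }

int main(void)
  {
  struct job_info *pjob = malloc(sizeof(*pjob));
  struct cleanup_info *ci;

  if (pjob == NULL)
    return 1;

  strcpy(pjob->jobid, "42.master");
  strcpy(pjob->workdir, "/var/spool/jobs/42");

  /* snapshot what cleanup needs, then the job struct may be freed */
  if ((ci = calloc(1, sizeof(*ci))) == NULL)
    return 1;

  strcpy(ci->jobid, pjob->jobid);
  strcpy(ci->workdir, pjob->workdir);

  free(pjob);             /* job is gone... */
  delete_job_files(ci);   /* ...but cleanup still has all it needs */
  return 0;
  }
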
Example 11
int add_encoded_attributes(

  xmlNodePtr     *attr_node, /* M attribute node */ 
  pbs_attribute  *pattr)     /* M ptr to pbs_attribute value array */

  {
  tlist_head  lhead;
  int         i;
  int         resc_access_perm = ATR_DFLAG_ACCESS;
  svrattrl   *pal;
  int         rc = PBSE_NONE;
  xmlNodePtr  attributeNode = *attr_node;
  char        buf[BUFSIZE];
  xmlNodePtr  pal_xmlNode;

  CLEAR_HEAD(lhead);
  xmlNodePtr  resource_list_head_node = NULL;
  xmlNodePtr  resource_used_head_node = NULL;
  xmlNodePtr  complete_req_head_node = NULL;

  for (i = 0; ((i < JOB_ATR_LAST) && (rc >= 0)); i++)
    {
    if ((job_attr_def[i].at_type != ATR_TYPE_ACL) &&
        ((pattr + i)->at_flags & ATR_VFLAG_SET))
      {
      if ((i != JOB_ATR_resource) &&
          (i != JOB_ATR_resc_used) &&
          (i != JOB_ATR_req_information))
        {
        std::string value;

#ifndef PBS_MOM
        if (i == JOB_ATR_depend)
          translate_dependency_to_string(pattr + i, value);
        else
#endif
          attr_to_str(value, job_attr_def + i, pattr[i], true);

        if (value.size() == 0)
          continue;

        pal_xmlNode = xmlNewChild(attributeNode,
                                  NULL,
                                  (xmlChar *)job_attr_def[i].at_name,
                                  (const xmlChar *)value.c_str());

        if (pal_xmlNode)
          {
          snprintf(buf, sizeof(buf), "%u", (unsigned int)pattr[i].at_flags);
          xmlSetProp(pal_xmlNode, (const xmlChar *)AL_FLAGS_ATTR, (const xmlChar *)buf);
          (pattr + i)->at_flags &= ~ATR_VFLAG_MODIFY;
          }
        }
      else
        {
        rc = job_attr_def[i].at_encode(pattr + i,
            &lhead,
            job_attr_def[i].at_name,
            NULL,
            ATR_ENCODE_SAVE,
            resc_access_perm);
        
        if (rc < 0)
          return -1;

        (pattr + i)->at_flags &= ~ATR_VFLAG_MODIFY;

        while ((pal = (svrattrl *)GET_NEXT(lhead)) != NULL)
          {
          if (i == JOB_ATR_resource)
            {
            pal_xmlNode = add_resource_list_attribute(ATTR_l,
                                                      attr_node,
                                                      &resource_list_head_node,
                                                      pal);
            }
          else if (i == JOB_ATR_req_information)
            {
            pal_xmlNode = add_resource_list_attribute(ATTR_req_information,
                                                      attr_node,
                                                      &complete_req_head_node,
                                                      pal);
            }
          else
            {
            pal_xmlNode = add_resource_list_attribute(ATTR_used,
                                                      attr_node,
                                                      &resource_used_head_node,
                                                      pal);
            }

          if (pal_xmlNode)
            {
            snprintf(buf, sizeof(buf), "%u", (unsigned int)pal->al_flags);
            xmlSetProp(pal_xmlNode, (const xmlChar *)AL_FLAGS_ATTR, (const xmlChar *)buf);
            }

          delete_link(&pal->al_link);
          free(pal);

          if (!pal_xmlNode)
            rc = -1;
          }
        }
      }
    }

  return((rc < 0) ? -1 : PBSE_NONE);
  } /* END add_encoded_attributes */
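
The encoder above leans on two libxml2 calls: xmlNewChild() creates one element per attribute and xmlSetProp() rides the at_flags word along as an XML property. The self-contained snippet below shows just that pairing; the "flags" property value is a stand-in, since AL_FLAGS_ATTR is defined elsewhere in TORQUE. Compile with the flags from xml2-config --cflags --libs:

#include <libxml/tree.h>
#include <stdio.h>

#define AL_FLAGS_ATTR "flags"  /* stand-in; the real macro lives in TORQUE's headers */

int main(void)
  {
  char       buf[32];
  xmlDocPtr  doc  = xmlNewDoc((const xmlChar *)"1.0");
  xmlNodePtr root = xmlNewNode(NULL, (const xmlChar *)"attributes");
  xmlNodePtr node;

  xmlDocSetRootElement(doc, root);

  /* one child element per attribute, value as text content */
  node = xmlNewChild(root, NULL,
                     (const xmlChar *)"jobname",
                     (const xmlChar *)"STDIN");

  /* the attribute flags ride along as an XML property */
  snprintf(buf, sizeof(buf), "%u", 1u);
  xmlSetProp(node, (const xmlChar *)AL_FLAGS_ATTR, (const xmlChar *)buf);

  xmlSaveFormatFileEnc("-", doc, "UTF-8", 1);  /* "-" writes to stdout */

  xmlFreeDoc(doc);
  xmlCleanupParser();
  return 0;
  }
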
Example 12
void
free_br(struct batch_request *preq)
{
	delete_link(&preq->rq_link);
	reply_free(&preq->rq_reply);

	if (preq->rq_parentbr) {
		/*
		 * have a parent who has the original info, so we cannot
		 * free any data malloc-ed outside of the basic structure;
		 * decrement the reference count in the parent and when it
		 * goes to zero,  reply_send() it
		 */
		if (preq->rq_parentbr->rq_refct > 0) {
			if (--preq->rq_parentbr->rq_refct == 0)
				reply_send(preq->rq_parentbr);
		}

		if (preq->rppcmd_msgid)
			free(preq->rppcmd_msgid);

		(void)free(preq);
		return;
	}

	/*
	 * IMPORTANT - free any data that is malloc-ed outside of the
	 * basic batch_request structure below here so it is not freed
	 * when a copy of the structure (for a Array subjob) is freed
	 */
	if (preq->rq_extend)
		(void)free(preq->rq_extend);

	switch (preq->rq_type) {
		case PBS_BATCH_QueueJob:
			free_attrlist(&preq->rq_ind.rq_queuejob.rq_attr);
			break;
		case PBS_BATCH_JobCred:
			if (preq->rq_ind.rq_jobcred.rq_data)
				(void)free(preq->rq_ind.rq_jobcred.rq_data);
			break;
		case PBS_BATCH_UserCred:
			if (preq->rq_ind.rq_usercred.rq_data)
				(void)free(preq->rq_ind.rq_usercred.rq_data);
			break;
		case PBS_BATCH_jobscript:
			if (preq->rq_ind.rq_jobfile.rq_data)
				(void)free(preq->rq_ind.rq_jobfile.rq_data);
			break;
		case PBS_BATCH_CopyHookFile:
			if (preq->rq_ind.rq_hookfile.rq_data)
				(void)free(preq->rq_ind.rq_hookfile.rq_data);
			break;
		case PBS_BATCH_HoldJob:
			freebr_manage(&preq->rq_ind.rq_hold.rq_orig);
			break;
		case PBS_BATCH_MessJob:
			if (preq->rq_ind.rq_message.rq_text)
				(void)free(preq->rq_ind.rq_message.rq_text);
			break;
		case PBS_BATCH_RelnodesJob:
			if (preq->rq_ind.rq_relnodes.rq_node_list)
				(void)free(preq->rq_ind.rq_relnodes.rq_node_list);
			break;
		case PBS_BATCH_PySpawn:
			arrayfree(preq->rq_ind.rq_py_spawn.rq_argv);
			arrayfree(preq->rq_ind.rq_py_spawn.rq_envp);
			break;
		case PBS_BATCH_ModifyJob:
		case PBS_BATCH_ModifyResv:
			freebr_manage(&preq->rq_ind.rq_modify);
			break;

		case PBS_BATCH_RunJob:
		case PBS_BATCH_AsyrunJob:
		case PBS_BATCH_StageIn:
		case PBS_BATCH_ConfirmResv:
			if (preq->rq_ind.rq_run.rq_destin)
				(void)free(preq->rq_ind.rq_run.rq_destin);
			break;
		case PBS_BATCH_StatusJob:
		case PBS_BATCH_StatusQue:
		case PBS_BATCH_StatusNode:
		case PBS_BATCH_StatusSvr:
		case PBS_BATCH_StatusSched:
		case PBS_BATCH_StatusHook:
		case PBS_BATCH_StatusRsc:
		case PBS_BATCH_StatusResv:
			if (preq->rq_ind.rq_status.rq_id)
				free(preq->rq_ind.rq_status.rq_id);
			free_attrlist(&preq->rq_ind.rq_status.rq_attr);
			break;
		case PBS_BATCH_CopyFiles:
		case PBS_BATCH_DelFiles:
			freebr_cpyfile(&preq->rq_ind.rq_cpyfile);
			break;
		case PBS_BATCH_CopyFiles_Cred:
		case PBS_BATCH_DelFiles_Cred:
			freebr_cpyfile_cred(&preq->rq_ind.rq_cpyfile_cred);
			break;
		case PBS_BATCH_MvJobFile:
			if (preq->rq_ind.rq_jobfile.rq_data)
				free(preq->rq_ind.rq_jobfile.rq_data);
			break;

#ifndef PBS_MOM		/* Server Only */

		case PBS_BATCH_SubmitResv:
			free_attrlist(&preq->rq_ind.rq_queuejob.rq_attr);
			break;
		case PBS_BATCH_Manager:
			freebr_manage(&preq->rq_ind.rq_manager);
			break;
		case PBS_BATCH_ReleaseJob:
			freebr_manage(&preq->rq_ind.rq_release);
			break;
		case PBS_BATCH_Rescq:
		case PBS_BATCH_ReserveResc:
		case PBS_BATCH_ReleaseResc:
			free_rescrq(&preq->rq_ind.rq_rescq);
			break;
		case PBS_BATCH_DefSchReply:
			free(preq->rq_ind.rq_defrpy.rq_id);
			free(preq->rq_ind.rq_defrpy.rq_txt);
			break;
		case PBS_BATCH_SelectJobs:
		case PBS_BATCH_SelStat:
			free_attrlist(&preq->rq_ind.rq_select.rq_selattr);
			free_attrlist(&preq->rq_ind.rq_select.rq_rtnattr);
			break;
		case PBS_BATCH_PreemptJobs:
			free(preq->rq_ind.rq_preempt.ppj_list);
			break;
#endif /* PBS_MOM */
	}
	if (preq->rppcmd_msgid)
		free(preq->rppcmd_msgid);
	(void)free(preq);
}
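
This PBS Pro variant adds a wrinkle the TORQUE free_br() above lacks: a request cloned for an array subjob points at its parent via rq_parentbr, and the parent is only reply_send()'d and reclaimed when the last child drops its reference. A minimal refcount sketch of that arrangement, with stand-in names for the rq_parentbr/rq_refct fields:

#include <stdio.h>
#include <stdlib.h>

struct request
  {
  struct request *parent;  /* like rq_parentbr */
  int             refct;   /* like rq_refct, counts live children */
  };

static void reply_send(struct request *preq)
  {
  printf("replying to parent request %p\n", (void *)preq);
  free(preq);
  }

static void free_request(struct request *preq)
  {
  if (preq->parent != NULL)
    {
    /* child copy: drop our reference; last one out answers the parent */
    if (preq->parent->refct > 0 && --preq->parent->refct == 0)
      reply_send(preq->parent);

    free(preq);
    return;
    }

  free(preq);  /* standalone request, no parent bookkeeping */
  }

int main(void)
  {
  struct request *parent = calloc(1, sizeof(*parent));
  struct request *c1 = calloc(1, sizeof(*c1));
  struct request *c2 = calloc(1, sizeof(*c2));

  if (parent == NULL || c1 == NULL || c2 == NULL)
    return 1;

  c1->parent = c2->parent = parent;
  parent->refct = 2;

  free_request(c1);  /* refct 2 -> 1, nothing sent yet */
  free_request(c2);  /* refct 1 -> 0, parent replied and freed */
  return 0;
  }
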