/*
 * req_jobcredential - handle a job credential request.
 *
 * Calls locate_new_job() with the request's connection and no job id.
 * The request is acknowledged when a job is returned and rejected with
 * PBSE_IVALREQ when none is.  Nothing else in the request is examined.
 */
void req_jobcredential(

  struct batch_request *preq)  /* I - the decoded request */

  {
  job *newjob = locate_new_job(preq->rq_conn, NULL);

  if (newjob != NULL)
    {
    reply_ack(preq);
    }
  else
    {
    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);
    }
  }  /* END req_jobcredential() */
/*
 * req_commit - handle a commit request for a job that is being received.
 *
 * The job is looked up with locate_new_job() using the request's connection
 * and the job id carried in the request.  The request is rejected with
 * PBSE_UNKJOBID when no job is found, and with PBSE_IVALREQ when the job is
 * not in substate JOB_SUBSTATE_TRANSICM.
 *
 * Otherwise the job is re-linked onto svr_alljobs, marked RUNNING/PRERUN and
 * passed to start_exec().  If the job is in JOB_SUBSTATE_EXITING afterwards,
 * a "start failed ..." text reply is sent; otherwise the job id is returned.
 * The job is saved with SAVEJOB_FULL in either case.
 */
void req_commit(

  struct batch_request *preq)  /* I */

  {
  job   *pj;

  pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit);

  if (LOGLEVEL >= 6)
    {
    /* pj has not been checked yet, so the job id must be guarded here */

    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "committing job");
    }

  if (pj == NULL)
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    log_err(errno, "req_commit", "cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  /* move job from new job list to "all" job list, set to running state */

  delete_link(&pj->ji_alljobs);

  append_link(&svr_alljobs, &pj->ji_alljobs, pj);

  /*
  ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior.
  */

  pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE;

  pj->ji_qs.ji_state = JOB_STATE_RUNNING;

  pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM;

  pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn);

  pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0;

  /* For MOM - start up the job (blocks) */

  /* pj is known to be non-NULL from here on, so the job id is used directly */

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pj->ji_qs.ji_jobid,
      "starting job execution");
    }

  start_exec(pj);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pj->ji_qs.ji_jobid,
      "job execution started");
    }

  /* if start request fails, reply with failure string */

  if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)
    {
    char tmpLine[1024];

    /* NOTE(review): ji_nodekill is only checked for >= 0; confirm that it
     * can never exceed the number of entries in ji_hosts */

    if ((pj->ji_hosts != NULL) &&
        (pj->ji_nodekill >= 0) &&
        (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL))
      {
      /* bounded: the host name length is not controlled by this function */

      snprintf(tmpLine, sizeof(tmpLine), "start failed on node %s",
               pj->ji_hosts[pj->ji_nodekill].hn_host);
      }
    else
      {
      snprintf(tmpLine, sizeof(tmpLine), "start failed on unknown node");
      }

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pj->ji_qs.ji_jobid,
        tmpLine);
      }

    reply_text(preq, 0, tmpLine);
    }
  else
    {
    reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Commit);
    }

  job_save(pj, SAVEJOB_FULL);

  /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath,
   * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure
   * pbs_server got these attr values.  This worked fine before TORQUE modified
   * job launched into an async process.  At 2.0.0p6, a new attribute "SEND" flag
   * was added to handle this process. */

  return;
  }  /* END req_commit() */
/*
 * req_rdytocommit - handle a "ready to commit" request.
 *
 * The job is looked up with locate_new_job() using the request's connection
 * and job id.  The request is rejected with PBSE_UNKJOBID when no job is
 * found, and with PBSE_IVALREQ when the job is not in substate
 * JOB_SUBSTATE_TRANSIN.
 *
 * Otherwise the job is switched to JOB_STATE_TRANSIT / JOB_SUBSTATE_TRANSICM
 * (state attribute 'T') and saved with SAVEJOB_NEW.  If the save fails, the
 * previous state values are restored and the request is rejected with
 * PBSE_SYSTEM.  If the job id reply cannot be sent, the connection is closed
 * and the job is purged.
 */
void req_rdytocommit(

  struct batch_request *preq)  /* I */

  {
  job *pj;
  int  sock = preq->rq_conn;

  int  OrigState;
  int  OrigSState;
  char OrigSChar;
  long OrigFlags;

  pj = locate_new_job(sock, preq->rq_ind.rq_rdytocommit);

  if (LOGLEVEL >= 6)
    {
    /* pj has not been checked yet, so the job id must be guarded here */

    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "ready to commit job");
    }

  if (pj == NULL)
    {
    log_err(errno, "req_rdytocommit", "unknown job id");

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    /* FAILURE */

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSIN)
    {
    log_err(errno, "req_rdytocommit", "cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    /* FAILURE */

    return;
    }

  /* remember the current values so they can be restored if the save fails */

  OrigState  = pj->ji_qs.ji_state;

  OrigSState = pj->ji_qs.ji_substate;
  OrigSChar  = pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char;
  OrigFlags  = pj->ji_wattr[(int)JOB_ATR_state].at_flags;

  pj->ji_qs.ji_state    = JOB_STATE_TRANSIT;
  pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSICM;
  pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char = 'T';
  pj->ji_wattr[(int)JOB_ATR_state].at_flags |= ATR_VFLAG_SET;

  if (job_save(pj, SAVEJOB_NEW) == -1)
    {
    /* capture errno once so the message and the logged code agree even if
     * a later library call changes it */

    int  save_errno = errno;
    char tmpLine[1024];

    snprintf(tmpLine, sizeof(tmpLine), "cannot save job - errno=%d - %s",
             save_errno,
             strerror(save_errno));

    log_err(save_errno, "req_rdytocommit", tmpLine);

    /* commit failed, backoff state changes */

    pj->ji_qs.ji_state    = OrigState;
    pj->ji_qs.ji_substate = OrigSState;
    pj->ji_wattr[(int)JOB_ATR_state].at_val.at_char = OrigSChar;
    pj->ji_wattr[(int)JOB_ATR_state].at_flags = OrigFlags;

    req_reject(PBSE_SYSTEM, 0, preq, NULL, tmpLine);

    /* FAILURE */

    return;
    }

  /* acknowledge the request with the job id */

  if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_RdytoCom) != 0)
    {
    /* reply failed, purge the job and close the connection */

    int reply_errno = errno;

    /* NOTE(review): 1024 mirrors the bound already used for log_buffer in
     * req_mvjobfile(); confirm against the declared size of log_buffer */

    snprintf(log_buffer, 1024, "cannot report jobid - errno=%d - %s",
             reply_errno,
             strerror(reply_errno));

    log_err(reply_errno, "req_rdytocommit", log_buffer);

    close_conn(sock);

    job_purge(pj);

    /* FAILURE */

    return;
    }

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pj->ji_qs.ji_jobid,
      "ready to commit job completed");
    }

  return;
  }  /* END req_rdytocommit() */
/*
 * req_mvjobfile - write one chunk of a job file sent in the request.
 *
 * The job is looked up first with locate_new_job() on the request's
 * connection and, failing that, with find_job() on the job id in the
 * request.  The target file is opened through open_std_file() using the
 * group id of the job's euser attribute, and the request data is written
 * to it.
 *
 * Replies:
 *   PBSE_UNKJOBID  - job not found, or check_pwd() failed with no group cache
 *   PBSE_MOMREJECT - password lookup failed, or the file could not be opened
 *   PBSE_SYSTEM    - fewer bytes than requested were written
 *   ack            - the chunk was written completely
 */
void req_mvjobfile(

  struct batch_request *preq)  /* I */

  {
  int            fds;
  enum job_file  jft;
  int            oflag;
  job           *pj;

  struct passwd *pwd;

  jft = (enum job_file)preq->rq_ind.rq_jobfile.rq_type;

  /* sequence number 0 truncates the file, any other sequence appends */

  if (preq->rq_ind.rq_jobfile.rq_sequence == 0)
    oflag = O_CREAT | O_WRONLY | O_TRUNC;
  else
    oflag = O_CREAT | O_WRONLY | O_APPEND;

  pj = locate_new_job(preq->rq_conn, NULL);

  if (pj == NULL)
    pj = find_job(preq->rq_ind.rq_jobfile.rq_jobid);

  if (pj == NULL)
    {
    /* NOTE(review): jft comes straight from the request and is used as an
     * index into TJobFileType without a range check - confirm it is
     * validated before this handler runs */

    snprintf(log_buffer, 1024, "cannot find job %s for move of %s file",
             preq->rq_ind.rq_jobfile.rq_jobid,
             TJobFileType[jft]);

    log_err(-1, "req_mvjobfile", log_buffer);

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  if ((pj->ji_grpcache == NULL) && (check_pwd(pj) == NULL))
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  pwd = getpwnam(pj->ji_wattr[(int)JOB_ATR_euser].at_val.at_str);

  if (pwd == NULL)
    {
    /* FAILURE */

    req_reject(PBSE_MOMREJECT, 0, preq, NULL, "password lookup failed");

    return;
    }

  fds = open_std_file(pj, jft, oflag, pwd->pw_gid);

  if (fds < 0)
    {
    /* FAILURE - reported separately so that an open error is not
     * mislabelled as a password lookup problem */

    req_reject(PBSE_MOMREJECT, 0, preq, NULL, "cannot open file");

    return;
    }

  if (write(
        fds,
        preq->rq_ind.rq_jobfile.rq_data,
        preq->rq_ind.rq_jobfile.rq_size) != preq->rq_ind.rq_jobfile.rq_size)
    {
    req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot create file");
    }
  else
    {
    /* only report success when the whole chunk was actually written */

    if (LOGLEVEL >= 6)
      {
      snprintf(log_buffer, 1024, "successfully moved %s file for job '%s'",
               TJobFileType[jft],
               preq->rq_ind.rq_jobfile.rq_jobid);

      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pj->ji_qs.ji_jobid,
        log_buffer);
      }

    reply_ack(preq);
    }

  close(fds);

  return;
  }  /* END req_mvjobfile() */
/*
 * req_jobscript - append one chunk of the job script to the script file.
 *
 * The job is looked up with locate_new_job() using the request's connection
 * and job id, and must be in substate JOB_SUBSTATE_TRANSIN.  If the job has
 * JOB_SVFLG_CHECKPOINT_FILE set, the chunk is acknowledged and discarded.
 *
 * The script path is path_jobs + the job's file prefix + JOB_SCRIPT_SUFFIX.
 * While the recorded script size is 0 the file is created exclusively;
 * afterwards it is opened for append.  On success the recorded script size
 * grows by the chunk size and JOB_SVFLG_SCRIPT is set.
 *
 * Replies:
 *   PBSE_IVALREQ  - job not found or not in JOB_SUBSTATE_TRANSIN
 *   PBSE_INTERNAL - path too long, open failed, or short write
 *   ack           - chunk stored (or ignored for a checkpointed job)
 */
void req_jobscript(

  struct batch_request *preq) /* ptr to the decoded request*/

  {
  char *id = "req_jobscript";

  int  fds;
  int  pathlen;
  char  namebuf[MAXPATHLEN];
  job *pj;
  int  filemode = 0700;
  extern char mom_host[];

  /* cleared so the state check below can tell whether errno was set since */

  errno = 0;

  pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_jobfile.rq_jobid);

  if (pj == NULL)
    {
    log_err(errno, id, "cannot locate new job");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  /* script chunks are only accepted while the job is still in TRANSIN */

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSIN)
    {
    /* NOTE(review): ji_substate indexes PJobSubState without a range check;
     * 1024 mirrors the bound already used for log_buffer in
     * req_mvjobfile() - confirm both against the real declarations */

    if (errno == 0)
      {
      snprintf(log_buffer, 1024, "job %s in unexpected state '%s'",
               pj->ji_qs.ji_jobid,
               PJobSubState[pj->ji_qs.ji_substate]);
      }
    else
      {
      snprintf(log_buffer, 1024,
               "job %s in unexpected state '%s' (errno=%d - %s)",
               pj->ji_qs.ji_jobid,
               PJobSubState[pj->ji_qs.ji_substate],
               errno,
               strerror(errno));
      }

    log_err(errno, id, log_buffer);

    req_reject(PBSE_IVALREQ, 0, preq, mom_host, log_buffer);

    return;
    }

  /* mom - if job has been checkpointed, discard script,already have it */

  if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
    {
    /* SUCCESS - do nothing, ignore script */

    reply_ack(preq);

    return;
    }

  /* build the script path with a bounds check instead of strcpy/strcat */

  pathlen = snprintf(namebuf, sizeof(namebuf), "%s%s%s",
                     path_jobs,
                     pj->ji_qs.ji_fileprefix,
                     JOB_SCRIPT_SUFFIX);

  if ((pathlen < 0) || ((size_t)pathlen >= sizeof(namebuf)))
    {
    /* FAILURE - a truncated path must not be opened */

    log_err(-1, id, "job script path is too long");

    req_reject(PBSE_INTERNAL, 0, preq, mom_host, "job script path is too long");

    return;
    }

  if (pj->ji_qs.ji_un.ji_newt.ji_scriptsz == 0)
    {
    /* NOTE:  fail if job script already exists */

    fds = open(namebuf, O_WRONLY | O_CREAT | O_EXCL | O_Sync, filemode);
    }
  else
    {
    fds = open(namebuf, O_WRONLY | O_APPEND | O_Sync, filemode);
    }

  if (fds < 0)
    {
    /* capture errno before any other call can change it */

    int  open_errno = errno;
    char tmpLine[1024];

    snprintf(tmpLine, sizeof(tmpLine), "cannot open '%s' errno=%d - %s",
             namebuf,
             open_errno,
             strerror(open_errno));

    /* FAILURE */

    log_err(open_errno, id, msg_script_open);

    req_reject(PBSE_INTERNAL, 0, preq, mom_host, tmpLine);

    return;
    }

  if (write(
        fds,
        preq->rq_ind.rq_jobfile.rq_data,
        (unsigned)preq->rq_ind.rq_jobfile.rq_size) != preq->rq_ind.rq_jobfile.rq_size)
    {
    /* FAILURE */

    log_err(errno, id, msg_script_write);

    req_reject(PBSE_INTERNAL, 0, preq, mom_host, "cannot write job command file");

    close(fds);

    return;
    }

  close(fds);

  pj->ji_qs.ji_un.ji_newt.ji_scriptsz += preq->rq_ind.rq_jobfile.rq_size;

  /* job has a script file */

  pj->ji_qs.ji_svrflags =
    (pj->ji_qs.ji_svrflags & ~JOB_SVFLG_CHECKPOINT_FILE) | JOB_SVFLG_SCRIPT;

  /* SUCCESS */

  reply_ack(preq);

  return;
  }  /* END req_jobscript() */
/* Example #6 (0) */
void req_commit(

  struct batch_request *preq)  /* I */

  {
  unsigned int  momport = 0;
  int           rc;
  job          *pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
      "committing job");
    }

  if (pj == NULL)
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
    log_err(errno, "req_commit", (char *)"cannot commit job in unexpected state");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

    return;
    }

  /* move job from new job list to "all" job list, set to running state */

  delete_link(&pj->ji_alljobs);

  alljobs_list.push_back(pj);

  /*
  ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior.
  */

  pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE;

  pj->ji_qs.ji_state = JOB_STATE_RUNNING;

  pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM;

  pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn,FALSE);

  pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0;

  /* For MOM - start up the job (blocks) */

  if (LOGLEVEL >= 6)
    log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, "req_commit:starting job execution");

  rc = start_exec(pj);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pj->ji_qs.ji_jobid,
      "req_commit:job execution started");
    }

  /* if start request fails, reply with failure string */

  if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)
    {
    char tmpLine[1024];

    if ((pj->ji_hosts != NULL) &&
        (pj->ji_nodekill >= 0) &&
        (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL))
      {
      sprintf(tmpLine, "start failed on node %s",
              pj->ji_hosts[pj->ji_nodekill].hn_host);
      }
    else
      {
      sprintf(tmpLine, "start failed on unknown node");
      }

    if (LOGLEVEL >= 6)
      {
      log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, tmpLine);
      }

    reply_text(preq, rc, tmpLine);
    }
  else
    {
    reply_sid(preq, pj->ji_wattr[JOB_ATR_session_id].at_val.at_long,BATCH_REPLY_CHOICE_Text);
    }

  if (multi_mom)
    {
    momport = pbs_rm_port;
    }

  job_save(pj, SAVEJOB_FULL, momport);

#ifdef NVIDIA_GPUS
  /*
   * Does this job have a gpuid assigned?
   * if so, then update gpu status
   */
  if ((use_nvidia_gpu) && 
      ((pj->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pj->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }
#endif  /* NVIDIA_GPUS */


  /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath,
   * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure
   * pbs_server got these attr values.  This worked fine before TORQUE modified
   * job launched into an async process.  At 2.0.0p6, a new pbs_attribute "SEND" flag
   * was added to handle this process. */

  return;
  }  /* END req_commit() */
/* Example #7 (0) */
void req_mvjobfile(

  struct batch_request *preq)  /* I */

  {
  int          fds;
  enum job_file  jft;
  int          oflag;
  job         *pj;

  struct passwd *pwd;
  char          *buf = NULL;

  jft = (enum job_file)preq->rq_ind.rq_jobfile.rq_type;

  if (preq->rq_ind.rq_jobfile.rq_sequence == 0)
    oflag = O_CREAT | O_WRONLY | O_TRUNC;
  else
    oflag = O_CREAT | O_WRONLY | O_APPEND;

  pj = locate_new_job(preq->rq_conn, NULL);

  if (pj == NULL)
    pj = mom_find_job(preq->rq_ind.rq_jobfile.rq_jobid);

  if (pj == NULL)
    {
    snprintf(log_buffer, 1024, "cannot find job %s for move of %s file",
      preq->rq_ind.rq_jobfile.rq_jobid,
      TJobFileType[jft]);

    log_err(-1, __func__, log_buffer);

    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  bool good;
  good = check_pwd(pj);
  if ((pj->ji_grpcache == NULL) && 
      (good == false))
    {
    req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

    return;
    }

  /* check_pwd allocated pwd and getpwnam_ext is going to allocate
     another one. Free pwd first */
  if ((pwd = getpwnam_ext(&buf, pj->ji_wattr[JOB_ATR_euser].at_val.at_str)) == NULL)
    {
    /* FAILURE */
    req_reject(PBSE_MOMREJECT, 0, preq, NULL, "password lookup failed");

    return;
    }

  if ((fds = open_std_file(pj, jft, oflag, pwd->pw_gid)) < 0)
    {
    int   keeping = 1;
    char *path = std_file_name(pj, jft, &keeping);

    snprintf(log_buffer,sizeof(log_buffer),
      "Cannot create file %s",
      path);

    req_reject(PBSE_SYSTEM, 0, preq, NULL, log_buffer);

    if (pwd)
      {
      free_pwnam(pwd, buf);
      }
    return;
    }
  if (pwd)
    {
    free_pwnam(pwd, buf);
    }

  if (write_ac_socket(
        fds,
        preq->rq_ind.rq_jobfile.rq_data,
        preq->rq_ind.rq_jobfile.rq_size) != preq->rq_ind.rq_jobfile.rq_size)
    {
    req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot create file");
    }
  else
    {
    if (LOGLEVEL >= 6)
      {
      sprintf(log_buffer, "successfully moved %s file for job '%s'",
        TJobFileType[jft],
        preq->rq_ind.rq_jobfile.rq_jobid);
      
      log_record(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pj->ji_qs.ji_jobid,
        log_buffer);
      }

    reply_ack(preq);
    }

  close(fds);

  return;
  }  /* END req_mvjobfile() */