Example #1
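This example appears to come from TORQUE's pbs_mom: req_commit() handles the batch Commit request for a newly staged job, moves it onto the server-wide job list, marks this MOM as Mother Superior, starts execution, and, when built with NVIDIA_GPUS, calls send_update_soon() to schedule a GPU status update if the job has GPUs assigned.
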
void req_commit(

    struct batch_request *preq)  /* I */

{
    unsigned int  momport = 0;
    int           rc;
    job          *pj = locate_new_job(preq->rq_conn, preq->rq_ind.rq_commit);

    if (LOGLEVEL >= 6)
    {
        log_record(
            PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
            "committing job");
    }

    if (pj == NULL)
    {
        req_reject(PBSE_UNKJOBID, 0, preq, NULL, NULL);

        return;
    }

    if (pj->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM)
    {
        log_err(errno, "req_commit", (char *)"cannot commit job in unexpected state");

        req_reject(PBSE_IVALREQ, 0, preq, NULL, NULL);

        return;
    }

    /* move job from new job list to "all" job list, set to running state */

    delete_link(&pj->ji_alljobs);

    append_link(&svr_alljobs, &pj->ji_alljobs, pj);

    /*
    ** Set JOB_SVFLG_HERE to indicate that this is Mother Superior.
    */

    pj->ji_qs.ji_svrflags |= JOB_SVFLG_HERE;

    pj->ji_qs.ji_state = JOB_STATE_RUNNING;

    pj->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN;

    pj->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM;

    pj->ji_qs.ji_un.ji_momt.ji_svraddr = get_connectaddr(preq->rq_conn, FALSE);

    pj->ji_qs.ji_un.ji_momt.ji_exitstat = 0;

    /* For MOM - start up the job (blocks) */

    if (LOGLEVEL >= 6)
    {
        log_record(
            PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
            "req_commit:starting job execution");
    }

    rc = start_exec(pj);

    if (LOGLEVEL >= 6)
    {
        log_record(
            PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pj->ji_qs.ji_jobid,
            "req_commit:job execution started");
    }

    /* if start request fails, reply with failure string */

    if (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)
    {
        char tmpLine[1024];

        if ((pj->ji_hosts != NULL) &&
                (pj->ji_nodekill >= 0) &&
                (pj->ji_hosts[pj->ji_nodekill].hn_host != NULL))
        {
            sprintf(tmpLine, "start failed on node %s",
                    pj->ji_hosts[pj->ji_nodekill].hn_host);
        }
        else
        {
            sprintf(tmpLine, "start failed on unknown node");
        }

        if (LOGLEVEL >= 6)
        {
            log_record(
                PBSEVENT_JOB,
                PBS_EVENTCLASS_JOB,
                (pj != NULL) ? pj->ji_qs.ji_jobid : "NULL",
                tmpLine);
        }

        reply_text(preq, rc, tmpLine);
    }
    else
    {
        reply_sid(preq, pj->ji_wattr[JOB_ATR_session_id].at_val.at_long, BATCH_REPLY_CHOICE_Text);
    }

    if (multi_mom)
    {
        momport = pbs_rm_port;
    }

    job_save(pj, SAVEJOB_FULL, momport);

#ifdef NVIDIA_GPUS
    /*
     * Does this job have a gpuid assigned?
     * if so, then update gpu status
     */
    if ((use_nvidia_gpu) &&
            ((pj->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
            (pj->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
        send_update_soon();
    }
#endif  /* NVIDIA_GPUS */


    /* NOTE: we used to flag JOB_ATR_errpath, JOB_ATR_outpath,
     * JOB_ATR_session_id, and JOB_ATR_altid as modified at this point to make sure
     * pbs_server got these attr values.  This worked fine before TORQUE made
     * job launch an asynchronous process.  At 2.0.0p6, a new pbs_attribute "SEND"
     * flag was added to handle this. */

    return;
}  /* END req_commit() */
Example #2
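mom_job_purge() tears down a finished job on the MOM: it hands the job's files to delete_job_files() (via the thread pool when thread_unlink_calls is set), unlinks the job from the global job lists, frees it, and checks whether the MOM should restart once no jobs remain. As in the first example, send_update_soon() is called to refresh GPU status when the job had a gpuid assigned.
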
void mom_job_purge(

  job *pjob)  /* I (modified) */

  {
  job_file_delete_info *jfdi;

  jfdi = (job_file_delete_info *)calloc(1, sizeof(job_file_delete_info));

  if (jfdi == NULL)
    {
    log_err(ENOMEM, __func__, (char *)"No space to allocate info for job file deletion");
    return;
    }

#ifdef NVIDIA_GPUS
  /*
   * Did this job have a gpuid assigned?
   * if so, then update gpu status
   */
  if (((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }
#endif  /* NVIDIA_GPUS */

  /* initialize struct information */
  if (pjob->ji_flags & MOM_HAS_TMPDIR)
    {
    jfdi->has_temp_dir = TRUE;
    pjob->ji_flags &= ~MOM_HAS_TMPDIR;
    }
  else
    jfdi->has_temp_dir = FALSE;

  strcpy(jfdi->jobid, pjob->ji_qs.ji_jobid);
  strcpy(jfdi->prefix, pjob->ji_qs.ji_fileprefix);

  if ((pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET))
    jfdi->checkpoint_dir = strdup(pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_val.at_str);

  jfdi->gid = pjob->ji_qs.ji_un.ji_momt.ji_exgid;
  jfdi->uid = pjob->ji_qs.ji_un.ji_momt.ji_exuid;

  if (thread_unlink_calls == TRUE)
    enqueue_threadpool_request(delete_job_files, jfdi);
  else
    delete_job_files(jfdi);

  /* remove this job from the global queue */
  delete_link(&pjob->ji_jobque);
  delete_link(&pjob->ji_alljobs);

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer,"removing job");

    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
    }

#if IBM_SP2==2        /* IBM SP PSSP 3.1 */
  unload_sp_switch(pjob);

#endif   /* IBM SP */

  mom_job_free(pjob);

  /* if no jobs are left, check if MOM should be restarted */

  if (((job *)GET_NEXT(svr_alljobs)) == NULL)
    MOMCheckRestart();

  return;
  }  /* END mom_job_purge() */
Example #3
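A later revision of mom_job_purge() from the same code base. Besides the steps in Example #2, it removes the job's PIDs from global_job_sid_set, takes the job off the exiting-jobs list, and, if the job requested a cpuclock resource, restores the node's CPU frequency before freeing the job.
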
void mom_job_purge(

  job *pjob)  /* I (modified) */

  {
  job_file_delete_info *jfdi;

  jfdi = (job_file_delete_info *)calloc(1, sizeof(job_file_delete_info));

  if (jfdi == NULL)
    {
    log_err(ENOMEM, __func__, (char *)"No space to allocate info for job file deletion");
    return;
    }

#ifdef NVIDIA_GPUS
  /*
   * Did this job have a gpuid assigned?
   * if so, then update gpu status
   */
  if (((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }
#endif  /* NVIDIA_GPUS */

  /* initialize struct information */
  if (pjob->ji_flags & MOM_HAS_TMPDIR)
    {
    jfdi->has_temp_dir = TRUE;
    pjob->ji_flags &= ~MOM_HAS_TMPDIR;
    }
  else
    jfdi->has_temp_dir = FALSE;

  strcpy(jfdi->jobid, pjob->ji_qs.ji_jobid);
  strcpy(jfdi->prefix, pjob->ji_qs.ji_fileprefix);

  if ((pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET))
    jfdi->checkpoint_dir = strdup(pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_val.at_str);

  jfdi->gid = pjob->ji_qs.ji_un.ji_momt.ji_exgid;
  jfdi->uid = pjob->ji_qs.ji_un.ji_momt.ji_exuid;

  /* remove each pid in ji_job_pid_set from the global_job_sid_set */
  for (job_pid_set_t::const_iterator job_pid_set_iter = pjob->ji_job_pid_set->begin();
       job_pid_set_iter != pjob->ji_job_pid_set->end();
       ++job_pid_set_iter)
    {
    /* get pid entry from ji_job_pid_set */
    pid_t job_pid = *job_pid_set_iter;

    /* see if job_pid exists in job_sid set */
    job_pid_set_t::const_iterator it = global_job_sid_set.find(job_pid);
    if (it != global_job_sid_set.end())
      {
      /* remove job_pid from the set */
      global_job_sid_set.erase(it);
      }
    }

  if (thread_unlink_calls == TRUE)
    enqueue_threadpool_request(delete_job_files, jfdi, request_pool);
  else
    delete_job_files(jfdi);

  /* remove this job from the global queue */
  delete_link(&pjob->ji_jobque);
  delete_link(&pjob->ji_alljobs);

  remove_from_exiting_list(pjob);

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer,"removing job");

    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);
    }

#if IBM_SP2==2        /* IBM SP PSSP 3.1 */
  unload_sp_switch(pjob);

#endif   /* IBM SP */

  // We had a request to change the CPU frequency for this job; now that the
  // job is done, restore the previous frequency.
  resource *presc = find_resc_entry(&pjob->ji_wattr[JOB_ATR_resource],
            find_resc_def(svr_resc_def, "cpuclock", svr_resc_size));
  if (presc != NULL)
    {
    std::string beforeFreq;

    nd_frequency.get_frequency_string(beforeFreq);
    if (!nd_frequency.restore_frequency())
      {
      std::string msg = "Failed to restore frequency.";
      log_ext(nd_frequency.get_last_error(), __func__, msg.c_str(), LOG_ERR);
      }
    else
      {
      std::string afterFreq;
      nd_frequency.get_frequency_string(afterFreq);
      std::string msg = "Restored frequency from " + beforeFreq + " to " + afterFreq;
      log_ext(PBSE_CHANGED_CPU_FREQUENCY, __func__, msg.c_str(), LOG_NOTICE);
      }
    }

  mom_job_free(pjob);

  /* if no jobs are left, check if MOM should be restarted */

  if (((job *)GET_NEXT(svr_alljobs)) == NULL)
    MOMCheckRestart();

  return;
  }  /* END mom_job_purge() */
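
All three examples guard send_update_soon() with the same test on JOB_ATR_exec_gpus. Below is a minimal sketch of that guard factored into a helper; the helper name maybe_update_gpu_status is hypothetical, and the job type, attribute indexes, and the send_update_soon() declaration are assumed to come from TORQUE's pbs_job.h and the MOM headers.

#ifdef NVIDIA_GPUS
/* Sketch only: the GPU-status guard shared by the examples above.
 * Assumes TORQUE's job type and attribute flags from pbs_job.h and a
 * declaration of send_update_soon(); the helper name is hypothetical. */
static void maybe_update_gpu_status(

  job *pjob)  /* I */

  {
  /* schedule a GPU status update only when the job has a gpuid assigned */
  if (((pjob->ji_wattr[JOB_ATR_exec_gpus].at_flags & ATR_VFLAG_SET) != 0) &&
      (pjob->ji_wattr[JOB_ATR_exec_gpus].at_val.at_str != NULL))
    {
    send_update_soon();
    }
  }  /* END maybe_update_gpu_status() */
#endif  /* NVIDIA_GPUS */

With such a helper, the call sites in req_commit() and mom_job_purge() would reduce to maybe_update_gpu_status(pj) and maybe_update_gpu_status(pjob), respectively.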