Exemplo n.º 1
0
END_TEST

START_TEST(find_job_by_array_with_removed_record_test)
  {
  /* Inserts five uniquely named jobs into a container and requires every
   * insert to report PBSE_NONE.
   * NOTE(review): despite the test's name, no lookup or record removal is
   * performed in the code visible here - confirm whether that is intended. */
  int result;
  all_jobs alljobs;

  struct job *test_job1 = job_alloc();
  strcpy(test_job1->ji_qs.ji_jobid, "test_job1");
  result = insert_job(&alljobs,test_job1);
  fail_unless(result == PBSE_NONE, "job insert fail1");

  struct job *test_job2 = job_alloc();
  strcpy(test_job2->ji_qs.ji_jobid, "test_job2");
  result = insert_job(&alljobs,test_job2);
  fail_unless(result == PBSE_NONE, "job insert fail2");

  struct job *test_job3 = job_alloc();
  strcpy(test_job3->ji_qs.ji_jobid, "test_job3");
  result = insert_job(&alljobs,test_job3);
  /* message previously read "fai3"; keep it consistent with its siblings */
  fail_unless(result == PBSE_NONE, "job insert fail3");

  struct job *test_job4 = job_alloc();
  strcpy(test_job4->ji_qs.ji_jobid, "test_job4");
  result = insert_job(&alljobs,test_job4);
  fail_unless(result == PBSE_NONE, "job insert fail4");

  struct job *test_job5 = job_alloc();
  strcpy(test_job5->ji_qs.ji_jobid, "test_job5");
  result = insert_job(&alljobs,test_job5);
  fail_unless(result == PBSE_NONE, "job insert fail5");
  }
Exemplo n.º 2
0
END_TEST

START_TEST(swap_jobs_test)
  {
  /* swap_jobs() must reject a NULL job in either position and must succeed
   * for two jobs that have both been inserted into the container. */
  all_jobs alljobs;
  struct job *test_job;
  struct job *second_test_job;

  int result;

  test_job = job_alloc();
  second_test_job = job_alloc();
  strcpy(test_job->ji_qs.ji_jobid,"test");
  strcpy(second_test_job->ji_qs.ji_jobid,"second_test");

  result = swap_jobs(&alljobs,NULL,test_job);
  fail_unless(result != PBSE_NONE, "NULL first input job fail");

  /* exercise swap_jobs() itself (this used to call insert_job_after()),
   * so the check matches the test's name and its failure message */
  result = swap_jobs(&alljobs,test_job,NULL);
  fail_unless(result != PBSE_NONE, "NULL second input job fail");

  insert_job(&alljobs, test_job);
  insert_job(&alljobs, second_test_job);
  result = swap_jobs(&alljobs, test_job,second_test_job);
  fail_unless(result == PBSE_NONE, "swap jobs fail");
  }
Exemplo n.º 3
0
END_TEST

START_TEST(swap_jobs_test)
  {
  /* swap_jobs() must reject a NULL job in either position and must succeed
   * for two jobs that have both been inserted into the container. */
  struct all_jobs alljobs;
  struct job *test_job;
  struct job *second_test_job;

  int result;
  initialize_all_jobs_array(&alljobs);

  test_job = job_alloc();
  second_test_job = job_alloc();

  result = swap_jobs(&alljobs,NULL,test_job);
  fail_unless(result != PBSE_NONE, "NULL first input job fail");

  /* exercise swap_jobs() itself (this used to call insert_job_after()),
   * so the check matches the test's name and its failure message */
  result = swap_jobs(&alljobs,test_job,NULL);
  fail_unless(result != PBSE_NONE, "NULL second input job fail");

  insert_job(&alljobs, test_job);
  insert_job(&alljobs, second_test_job);
  result = swap_jobs(&alljobs, test_job,second_test_job);
  fail_unless(result == PBSE_NONE, "swap jobs fail");
  }
Exemplo n.º 4
0
END_TEST


START_TEST(test_remove_some_recycle_jobs)
  {
  /* Fills the recycler with 1000 jobs, clears ji_momstat on the first 700,
   * then runs remove_some_recycle_jobs on three threads and expects exactly
   * 300 jobs to remain in the recycler. */
  enum { TOTAL_JOBS = 1000, REMOVABLE_JOBS = 700, WORKER_COUNT = 3 };

  job       *recycled[TOTAL_JOBS];
  pthread_t  workers[WORKER_COUNT];

  initialize_recycler();

  for (int idx = 0; idx < TOTAL_JOBS; idx++)
    {
    recycled[idx] = job_alloc();
    fail_unless(insert_into_recycler(recycled[idx]) == PBSE_NONE);

    // only the leading REMOVABLE_JOBS entries are set up for removal
    if (idx < REMOVABLE_JOBS)
      recycled[idx]->ji_momstat = 0;
    }

  // start every worker before waiting on any of them
  for (int w = 0; w < WORKER_COUNT; w++)
    pthread_create(&workers[w], NULL, remove_some_recycle_jobs, NULL);

  for (int w = 0; w < WORKER_COUNT; w++)
    pthread_join(workers[w], NULL);

  // whatever was not marked for removal should still be there
  fail_unless(recycler.rc_jobs.count() == 300);
  }
Exemplo n.º 5
0
END_TEST

START_TEST(insert_job_after_test)
  {
  /* insert_job_after() must fail for a NULL container, a NULL anchor and a
   * NULL job, and succeed once the anchor job is in the container. */
  all_jobs    container;
  struct job *pjob = job_alloc();
  int         rc;

  strcpy(pjob->ji_qs.ji_jobid,"mylittlejob");

  /* invalid-argument cases */
  rc = insert_job_after(NULL,pjob,pjob);
  fail_unless(rc != PBSE_NONE, "insert into null array fail");

  rc = insert_job_after(&container,(char *)NULL,pjob);
  fail_unless(rc != PBSE_NONE, "NULL job after insert fail");

  rc = insert_job_after(&container,pjob,NULL);
  fail_unless(rc != PBSE_NONE, "NULL job to insert fail");

  /* valid case: the anchor has to be present first */
  insert_job(&container,pjob);
  rc = insert_job_after(&container,pjob,pjob);
  fail_unless(rc == PBSE_NONE, "job insert fail");
  }
Exemplo n.º 6
0
END_TEST

START_TEST(insert_job_after_test)
  {
  struct all_jobs alljobs;
  struct job *test_job;

  int result;
  initialize_all_jobs_array(&alljobs);

  test_job = job_alloc();

  result = insert_job_after(NULL,test_job,test_job);
  fail_unless(result != PBSE_NONE, "insert into null array fail");

  result = insert_job_after(&alljobs,NULL,test_job);
  fail_unless(result != PBSE_NONE, "NULL job after insert fail");

  result = insert_job_after(&alljobs,test_job,NULL);
  fail_unless(result != PBSE_NONE, "NULL job to insert fail");

  insert_job(&alljobs,test_job);
  result = insert_job_after(&alljobs,test_job,test_job);
  fail_unless(result == PBSE_NONE, "job insert fail");
  }
Exemplo n.º 7
0
END_TEST


START_TEST(test_pop_job_from_recycler)
  {
  /* Checks that pop_job_from_recycler() returns jobs in insertion order,
   * returns NULL once the recycler is empty, and that every job popped in
   * the second round carries a job id equal to its own address printed as
   * 16 hex digits. */
  job *pjobs[10];
  initialize_recycler();

  // start from an empty recycler
  while (recycler.rc_jobs.count() > 0)
    pop_job_from_recycler(&recycler.rc_jobs);

  for (int i = 0; i < 10; i++)
    {
    pjobs[i] = job_alloc();
    fail_unless(insert_into_recycler(pjobs[i]) == PBSE_NONE);
    }

  for (unsigned int i = 0; i < 10; i++)
    {
    job *pjob = pop_job_from_recycler(&recycler.rc_jobs);
    fail_unless(pjob == pjobs[i]);
    fail_unless(recycler.rc_jobs.count() == 9 - i);
    }

  for (int i = 0; i < 3; i++)
    fail_unless(pop_job_from_recycler(&recycler.rc_jobs) == NULL);

  // test for records already freed or that it has not been recycled
  for (int i = 0; i < 5; i++)
    {
    pjobs[i] = job_alloc();
    fail_unless(insert_into_recycler(pjobs[i]) == PBSE_NONE);
    }

  job *pjob = pop_job_from_recycler(&recycler.rc_jobs);
  int count = 0;
  char buf[80];
  while(pjob)
    {
    count++;
    // %lx expects an unsigned long argument
    snprintf(buf,sizeof(buf),"%016lx",(unsigned long)pjob);
    // never use the job id itself as the format string: pass it through "%s"
    fail_unless(strcmp(pjob->ji_qs.ji_jobid, buf)==0, "%s", pjob->ji_qs.ji_jobid);
    pjob = pop_job_from_recycler(&recycler.rc_jobs);
    }

  fail_unless(count == 5, "count of valid recycled jobs were wrong: %d", count);
  }
Exemplo n.º 8
0
END_TEST

START_TEST(job_alloc_test)
  {
  /* job_alloc() is expected to hand back a usable (non-NULL) job. */
  struct job *allocated;

  allocated = NULL;
  allocated = job_alloc();

  fail_unless(allocated != NULL, "job was not allocated");
  }
Exemplo n.º 9
0
END_TEST /* handle_aborted_job_test */

START_TEST(job_alloc_test)
  {
  /* Allocation smoke test: the returned job pointer must not be NULL. */
  struct job *new_job;

  new_job = NULL;
  new_job = job_alloc();

  fail_unless(new_job != NULL, "job was not allocated");
  }
Exemplo n.º 10
0
/*
 * job_recov() - rebuild a job structure from its save file.
 *
 * The file path is path_jobs followed by filename. job_recov_xml() is tried
 * first; job_recov_binary() is tried only when the former returns
 * PBSE_INVALID_SYNTAX. On success set_array_job_ids() is applied, the job is
 * marked as committed and written back with job_save().
 *
 * Returns the recovered job, or NULL on any failure. Only a return code of -1
 * causes this function to log the error and release the job itself.
 */
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  char    log_buf[LOCAL_LOG_BUF_SIZE];
  char    namebuf[MAXPATHLEN];
  size_t  logBufLen = sizeof(log_buf);
  int     rc;
  job    *pj = job_alloc(); /* allocate & initialize job structure space */

  /* FAILURE - cannot alloc memory */
  if (pj == NULL)
    return(NULL);

  /* job directory path, filename */
  snprintf(namebuf, MAXPATHLEN, "%s%s", path_jobs, filename);

  rc = job_recov_xml(namebuf, &pj, log_buf, logBufLen);

  /* fall back to the binary reader only for a syntax rejection */
  if ((rc != 0) && (rc == PBSE_INVALID_SYNTAX))
    rc = job_recov_binary(namebuf, &pj, log_buf, logBufLen);

  if (rc == PBSE_NONE)
    rc = set_array_job_ids(&pj, log_buf, logBufLen);

  if (rc != PBSE_NONE)
    {
    /* sometime pjob is freed by abt_job(), so only -1 is cleaned up here */
    if (rc == -1)
      {
      log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
      unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
      free(pj->ji_mutex);
#endif
      free((char *)pj);
      }

    return(NULL);
    }

  /* all done recovering the job */
  pj->ji_commit_done = 1;

#ifdef PBS_MOM
  job_save(pj, SAVEJOB_FULL, (multi_mom == 0)?0:pbs_rm_port);
#else
  job_save(pj, SAVEJOB_FULL, 0);
#endif

  return(pj);
  }  /* END job_recov() */
Exemplo n.º 11
0
END_TEST

START_TEST(cleanup_restart_file_test)
  {
  /* Smoke test: cleanup_restart_file() is called with a NULL job and then
   * with a freshly allocated one; nothing is asserted. */
  struct job *pjob;

  pjob = NULL;
  cleanup_restart_file(pjob); /*TODO: add a kind of assert*/

  pjob = job_alloc();
  cleanup_restart_file(pjob);
  }
Exemplo n.º 12
0
END_TEST

START_TEST(remove_checkpoint_test)
  {
  /* Smoke test: remove_checkpoint() is called with a NULL handle, a handle
   * to a NULL job and a handle to an allocated job; nothing is asserted. */
  struct job *pjob;

  pjob = NULL;

  remove_checkpoint(NULL); /*TODO: add a kind of assert*/
  remove_checkpoint(&pjob);

  pjob = job_alloc();
  remove_checkpoint(&pjob);
  }
Exemplo n.º 13
0
END_TEST

START_TEST(svr_job_purge_test)
  {
  /* svr_job_purge() must return non-zero for a NULL job; for an empty,
   * freshly allocated job any return value >= -1 is currently accepted. */
  struct job* test_job = NULL;
  int result = svr_job_purge(test_job);
  fail_unless(result != 0, "NULL job input fail");

  test_job = job_alloc();
  /* store the return value so the assertion below checks this call rather
   * than the stale result of the NULL-input call above */
  result = svr_job_purge(test_job);
  fail_unless(result >= -1, "empty job input fail: %d", result);/*TODO: fix -1 via log_job_record mock*/
  }
Exemplo n.º 14
0
END_TEST

START_TEST(copy_job_test)
  {
  /* copy_job(NULL) must yield NULL; copying an allocated job must yield a
   * non-NULL job. */
  struct job *null_copy;
  struct job *source;
  struct job *duplicate;

  null_copy = copy_job(NULL);
  source    = job_alloc();
  duplicate = NULL;
  fail_unless(null_copy == NULL, "NULL input check fail");

  duplicate = copy_job(source);
  fail_unless(duplicate != NULL, "job was not copied");
  /* TODO: add check for correctness of the copy */
  }
Exemplo n.º 15
0
END_TEST

START_TEST(job_free_test)
  {
  /* Smoke test: job_free() is called on a NULL job and on an allocated job;
   * nothing is asserted. */
  struct job *pjob;

  /* NULL value test */
  pjob = NULL;
  job_free(pjob,0);

  /* allocated job */
  pjob = job_alloc();
  job_free(pjob,0);
  /* TODO: add some check for the free operation
     or refactor function interface, suggestions:
     1) pointer to pointer pass, then null the actual pointer
     2) return value, could be a code or the actual ptr from 1*/
  }
Exemplo n.º 16
0
END_TEST

START_TEST(get_jobs_array_test)
  {
  /* get_jobs_array() is expected to return NULL for a NULL handle, for a
   * handle to a NULL job, and for a job straight out of job_alloc(). */
  struct job       *pjob = NULL;
  struct job_array *found;

  found = get_jobs_array(NULL);
  fail_unless(found == NULL, "NULL input pointer to pointer to job fail");

  found = get_jobs_array(&pjob);
  fail_unless(found == NULL, "NULL input pointer to job fail");

  pjob  = job_alloc();
  found = get_jobs_array(&pjob);
  fail_unless(found == NULL, "get job array fail");
  }
Exemplo n.º 17
0
END_TEST

START_TEST(get_jobs_queue_test)
  {
  /* get_jobs_queue() is expected to return NULL for a NULL handle, for a
   * handle to a NULL job, and for a job straight out of job_alloc(). */
  struct job       *pjob = NULL;
  struct pbs_queue *found;

  found = get_jobs_queue(NULL);
  fail_unless(found == NULL, "NULL input pointer to pointer to job fail");

  found = get_jobs_queue(&pjob);
  fail_unless(found == NULL, "NULL input pointer to job fail");

  pjob  = job_alloc();
  found = get_jobs_queue(&pjob);
  fail_unless(found == NULL, "get job queue fail");
  }
Exemplo n.º 18
0
END_TEST


START_TEST(delete_dependency_job_test)
  {
  /* With parent and child both set to job1 the request must be rejected
   * with PBSE_IVALREQ; with child set to job2 it must succeed and leave the
   * caller's job pointer NULL. */
  job           *dependent = job_alloc();
  batch_request  request;
  int            rc;

  memset(&request, 0, sizeof(request));
  strcpy(request.rq_ind.rq_register.rq_parent, job1);
  strcpy(request.rq_ind.rq_register.rq_child, job1);

  rc = delete_dependency_job(&request, &dependent);
  fail_unless(rc == PBSE_IVALREQ);

  strcpy(request.rq_ind.rq_register.rq_child, job2);
  rc = delete_dependency_job(&request, &dependent);
  fail_unless(rc == PBSE_NONE);
  fail_unless(dependent == NULL);
  }
Exemplo n.º 19
0
END_TEST

START_TEST(remove_job_test)
  {
  /* remove_job() must fail for a NULL container or a NULL job and succeed
   * for a job that was inserted beforehand. */
  all_jobs    container;
  int         rc;
  struct job *pjob = job_alloc();

  /* invalid-argument cases */
  rc = remove_job(NULL,pjob);
  fail_unless(rc != PBSE_NONE, "remove from null array fail");

  rc = remove_job(&container,NULL);
  fail_unless(rc != PBSE_NONE, "NULL job remove fail");

  /* valid case */
  insert_job(&container,pjob);
  rc = remove_job(&container,pjob);
  fail_unless(rc == PBSE_NONE, "job remove fail");
  }
Exemplo n.º 20
0
END_TEST

START_TEST(insert_job_first_test)
  {
  /* insert_job_first() must fail for a NULL container or a NULL job and
   * succeed when both arguments are valid. */
  all_jobs    container;
  struct job *pjob = job_alloc();
  int         rc;

  /* invalid-argument cases */
  rc = insert_job_first(NULL,pjob);
  fail_unless(rc != PBSE_NONE, "insert into null array fail");

  rc = insert_job_first(&container,NULL);
  fail_unless(rc != PBSE_NONE, "NULL job insert fail");

  /* valid case */
  rc = insert_job_first(&container,pjob);
  fail_unless(rc == PBSE_NONE, "job insert fail");
  }
Exemplo n.º 21
0
END_TEST

START_TEST(has_job_test)
  {
  /* has_job() must not return PBSE_NONE for a NULL container or NULL job,
   * and must return TRUE for a job that has been inserted. */
  all_jobs    container;
  struct job *pjob = job_alloc();
  int         answer;

  /* invalid-argument cases */
  answer = has_job(NULL,pjob);
  fail_unless(answer != PBSE_NONE, "null input array fail");

  answer = has_job(&container,NULL);
  fail_unless(answer != PBSE_NONE, "NULL input job fail");

  /* valid case */
  insert_job(&container, pjob);
  answer = has_job(&container, pjob);
  fail_unless(answer == TRUE, "has_job fail");
  }
Exemplo n.º 22
0
END_TEST

START_TEST(get_jobs_index_test)
  {
  struct all_jobs alljobs;
  struct job *test_job = job_alloc();
  int result;
  initialize_all_jobs_array(&alljobs);

  result = get_jobs_index(NULL,test_job);
  fail_unless(result == -1 * PBSE_BAD_PARAMETER, "null input array fail");

  result = get_jobs_index(&alljobs,NULL);
  fail_unless(result == -1 * PBSE_BAD_PARAMETER, "NULL input job fail");

  insert_job(&alljobs, test_job);
  result = get_jobs_index(&alljobs, test_job);
  fail_unless(result == PBSE_NONE, "get_jobs_index fail");
  }
Exemplo n.º 23
0
END_TEST

START_TEST(cpy_checkpoint_test)
  {
  /* cpy_checkpoint() must return NULL when either the batch request or the
   * job argument is NULL. */
  struct job           *pjob;
  struct batch_request *copied;
  struct batch_request *base_request;

  pjob = job_alloc();

  /* NULL request, valid job */
  copied = cpy_checkpoint(NULL, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_IN);
  base_request = alloc_br(/*PBS_BATCH_CheckpointJob*/0);
  fail_unless(copied == NULL, "NULL batch_request input fail");

  /* valid request, NULL job */
  copied = cpy_checkpoint(base_request, NULL, JOB_ATR_checkpoint_name, CKPT_DIR_IN);
  fail_unless(copied == NULL, "NULL job input fail");

  /*TODO: add test for valid input, invalid dir value*/
  }
Exemplo n.º 24
0
END_TEST

START_TEST(svr_job_purge_test)
  {
  /* svr_job_purge() must return non-zero for a NULL job and zero for a
   * queued job, both when dequejob_rc is PBSE_JOB_NOT_IN_QUEUE and when it
   * is 0. The called_remove_job counter is used to tell whether job_free
   * was reached. */
  struct job* test_job = NULL;
  int result = svr_job_purge(test_job);
  fail_unless(result != 0, "NULL job input fail");

  called_remove_job = 0;
  dequejob_rc = PBSE_JOB_NOT_IN_QUEUE;
  test_job = job_alloc();
  test_job->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;
  test_job->ji_qs.ji_state = JOB_STATE_QUEUED;
  result = svr_job_purge(test_job);
  /* the message needs a conversion for the result argument it is given */
  fail_unless(result == 0, "non-queued job fail: %d", result);
  // called_remove_job once means we didn't call job_free
  fail_unless(called_remove_job == 1);
  
  dequejob_rc = 0;
  result = svr_job_purge(test_job);
  fail_unless(result == 0, "queued job fail: %d", result);
  // Calling remove_job twice means we did call job_free
  fail_unless(called_remove_job == 3);
  }
Exemplo n.º 25
0
/*
 * mom_req_quejob() - handle a queue-job batch request on the MOM.
 *
 * Outline of what the code below does:
 *   1. Rejects the request when PBSNodeCheckProlog is set and the node is
 *      flagged INUSE_DOWN after check_state(1), when reject_job_submit is
 *      TRUE, or when the request did not come from a server (rq_fromsvr).
 *   2. Looks for an existing job with the requested id, first through
 *      mom_find_job() and then by walking the svr_newjobs list.
 *   3. Picks the file base name from the ATTR_hashname attribute of the
 *      request, falling back to the job id.
 *   4. If the job already exists: rejects it when it is running; keeps it
 *      (checkpoint mode) when JOB_SVFLG_CHECKPOINT_FILE is set; otherwise
 *      rejects it as already present. If it does not exist, a new job is
 *      allocated.
 *   5. Decodes the request attributes into the job. In checkpoint mode only
 *      ATTR_checkpoint_name and ATTR_v are processed. Any decode problem
 *      purges the job with mom_job_purge() and answers with reply_badattr().
 *   6. Replies with the job id and links the job onto svr_newjobs.
 *
 * The function returns nothing; every outcome is reported to the requester
 * through req_reject(), reply_badattr() or reply_jobid().
 */
void mom_req_quejob(

  batch_request *preq) /* ptr to the decoded request   */

  {
  char           basename[PBS_JOBBASE + 1];
  int            created_here = 0;
  int            index;
  char          *jid;
  attribute_def *pdef;
  job           *pj;
  svrattrl      *psatl;
  int            rc;
  int            sock = preq->rq_conn;

  int            IsCheckpoint = 0;
  /* set basic (user) level access permission */
  int            resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat;

  memset(basename, 0, sizeof(basename));

  if (PBSNodeCheckProlog)
    {
    check_state(1);

    if (internal_state & INUSE_DOWN)
      {
      req_reject(PBSE_BADMOMSTATE, 0, preq, NULL, NULL);

      return;
      }
    }

  if (reject_job_submit == TRUE)
    {
    req_reject(-1, 0, preq, NULL, "This mom is configured not to run jobs");
    return;
    }

  if (preq->rq_fromsvr)
    {
    /* from another server - accept the extra attributes */

    resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM;

    jid = preq->rq_ind.rq_queuejob.rq_jid;
    }
  else
    {
    /* request must be from server */

    log_err(errno, __func__, (char *)"request not from server");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server");

    return;
    }

  /* does job already exist, check both old and new jobs */

  if ((pj = mom_find_job(jid)) == NULL)
    {
    pj = (job *)GET_NEXT(svr_newjobs);

    while (pj != NULL)
      {
      if (!strcmp(pj->ji_qs.ji_jobid, jid))
        break;

      pj = (job *)GET_NEXT(pj->ji_alljobs);
      }
    }

  /*
   * New job ...
   *
   * for MOM - rather than make up a hashname, we use the name sent
   * to us by the server as an pbs_attribute.
   */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (!strcmp(psatl->al_name,ATTR_hashname))
      {
      snprintf(basename, sizeof(basename), "%s", psatl->al_value);

      break;
      }

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }

  /* no hashname attribute was supplied: fall back to the job id */
  if (basename[0] == '\0')
    snprintf(basename, sizeof(basename), "%s", jid);

  if (pj != NULL)
    {
    /* newly queued job already exists */

    if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING)
      {
      /* FAILURE - job exists and is running */

      log_err(errno, __func__, (char *)"cannot queue new job, job exists and is running");

      req_reject(PBSE_JOBEXIST, 0, preq, NULL, "job is running");

      return;
      }

    /* if checkpointed, then keep old and skip rest of process */

    if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
      {
      IsCheckpoint = 1;
      }  /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */
    else
      {
      /* reject the job. It is already working here. */
      sprintf(log_buffer, "Job already exists. State: %d substate: %d", pj->ji_qs.ji_state, pj->ji_qs.ji_substate);
      log_err(-1, __func__, log_buffer);
      sprintf(log_buffer, "Job %s already on mom", pj->ji_qs.ji_jobid);
      req_reject(PBSE_JOBEXIST, 0, preq, NULL, log_buffer);
      return;
      }
    }  /* END if (pj != NULL) */
  else
    {
    /* if not already here, allocate job struct */

    if ((pj = job_alloc()) == NULL)
      {
      /* FAILURE */

      req_reject(PBSE_MEM_MALLOC, 0, preq, NULL, "cannot allocate new job structure");

      return;
      }
    }    /* END else (pj != NULL) */

  if (IsCheckpoint == 0)
    {
    /* NOTE(review): both copies are unbounded; this assumes jid fits in
     * ji_jobid and basename fits in ji_fileprefix - TODO confirm sizes */
    strcpy(pj->ji_qs.ji_jobid,jid);

    strcpy(pj->ji_qs.ji_fileprefix,basename);

    pj->ji_modified       = 1;

    pj->ji_qs.ji_svrflags = created_here;

    pj->ji_qs.ji_un_type  = JOB_UNION_TYPE_NEW;

    /* changing the union type overwrites the euid for the job, and if
     * ji_grpcache is set this potentially allows jobs to run as root. Unsetting
     * ji_grpcache fixes this problem --dbeer */
    if (pj->ji_grpcache != NULL)
      {
      free(pj->ji_grpcache);
      pj->ji_grpcache = NULL;
      }
    }

  /* decode attributes from request into job structure */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (IsCheckpoint == 1)
      {
      /* checkpoint mode: skip everything except the checkpoint name and
       * the variable list */
      if (strcmp(psatl->al_name,ATTR_checkpoint_name) &&
          strcmp(psatl->al_name,ATTR_v))
        {
        psatl = (svrattrl *)GET_NEXT(psatl->al_link);

        continue;
        }
      }

    /* identify the pbs_attribute by name */

    index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST);

    if (index < 0)
      {
      /* FAILURE */

      /* didn`t recognize the name */

      mom_job_purge(pj);   /* CRI - 12/20/2004 */

      reply_badattr(PBSE_NOATTR, 1, psatl, preq);

      return;
      }

    pdef = &job_attr_def[index];

    /* Is pbs_attribute not writeable by manager or by a server? */

    if ((pdef->at_flags & resc_access_perm) == 0)
      {
      /* FAILURE */

      mom_job_purge(pj);

      reply_badattr(PBSE_ATTRRO, 1, psatl, preq);

      return;
      }

    /* decode pbs_attribute */

    if (!strcmp(psatl->al_name,ATTR_v))
      {
      rc = decode_arst_merge(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value);
      }
    else
      {
      rc = pdef->at_decode(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value,
             resc_access_perm);
      }

    if (rc != 0)
      {
      /* FAILURE */

      /* all errors are fatal for MOM */

      mom_job_purge(pj);

      reply_badattr(rc, 1, psatl, preq);

      return;
      }

    if (psatl->al_op == DFLT)
      {
      if (psatl->al_resc)
        {
        resource     *presc;
        resource_def *prdef;

        prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size);

        if (prdef == NULL)
          {
          mom_job_purge(pj);

          /* NOTE(review): rc is always 0 here (a non-zero rc returned
           * above), so this reply carries error code 0 - confirm whether a
           * specific error code was intended */
          reply_badattr(rc, 1, psatl, preq);

          return;
          }

        presc = find_resc_entry(&pj->ji_wattr[index],prdef);

        if (presc != NULL)
          presc->rs_value.at_flags |= ATR_VFLAG_DEFLT;
        }
      else
        {
        pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT;
        }
      }    /* END if (psatl->al_op == DFLT) */

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }      /* END while (psatl != NULL) */

  if (IsCheckpoint == 1)
    {
    pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

    if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0)
      {
      /* move the existing job from its current list onto svr_newjobs */
      remove_from_job_list(pj);

      append_link(&svr_newjobs,&pj->ji_alljobs,pj);

      if (pj->ji_grpcache != NULL)
        {
        free(pj->ji_grpcache);
        pj->ji_grpcache = NULL;
        }

      pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;
      pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;
      pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock,FALSE);
      pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

      /* Per Eric R., req_mvjobfile was giving error in open_std_file,
         showed up as fishy error message */

      /* NOTE(review): ji_grpcache was already freed and set to NULL a few
       * lines above, so this second check looks redundant */
      if (pj->ji_grpcache != NULL)
        {
        free(pj->ji_grpcache);
        pj->ji_grpcache = NULL;
        }
      }
    else
      {
      close_conn(sock, FALSE);
      }

    /* SUCCESS */

    return;
    }

  /* set remaining job structure elements */
  pj->ji_qs.ji_state =    JOB_STATE_TRANSIT;
  pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

  pj->ji_wattr[JOB_ATR_mtime].at_val.at_long = (long)time_now;
  pj->ji_wattr[JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET;

  if (pj->ji_grpcache != NULL)
    {
    free(pj->ji_grpcache);
    pj->ji_grpcache = NULL;
    }

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;
  pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;
  pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock,FALSE);
  pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

  /* acknowledge the request with the job id */
  if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0)
    {
    /* reply failed, purge the job and close the connection */
    // call mom_job_purge first so that double-frees don't happen 
    // when the on_close function is called
    mom_job_purge(pj);

    close_conn(sock, FALSE);

    return;
    }

  /* link job into server's new jobs list request  */
  append_link(&svr_newjobs, &pj->ji_alljobs, pj);

  return;
  }  /* END mom_req_quejob() */
Exemplo n.º 26
0
/*
 * job_or_resv_recov_fs - recover a job or a resource reservation from its
 * save file.
 *
 * filename	name of the save file, relative to the job (or reservation)
 *		directory
 * objtype	RESC_RESV_OBJECT to recover a reservation, anything else to
 *		recover a job
 *
 * The fixed ("quick save") sub-structure is read first, the file name is
 * checked against the prefix held in the object, and finally the working
 * attributes are read with recov_attr_fs().
 *
 * Returns a pointer to the recovered object, or a null pointer on any
 * failure (the partially built object is released before returning).
 * On a MOM a reservation request always returns a null pointer.
 */
void*
job_or_resv_recov_fs(char *filename, int objtype)
{
	int		 fds;
	int		 pathlen;
	job		*pj = NULL;
	void		*pobj = NULL;
#ifndef PBS_MOM
	resc_resv	*presv;
#endif
	void		*p_fixed = NULL;
	int		fixed_size;
	char		*prefix = NULL;
	char		*path = NULL;
	char		*err_msg;
	char		*ptcs;		/*text control string for err msg*/
	char		*pobjID = NULL;
	char		*pn;		/*name of the file "root" (prefix)*/
	attribute	*wattr = NULL;
	attribute_def	*p_attr_def = NULL;
	int		final_attr;
	int		attr_unkn;
	char		namebuf[MAXPATHLEN];
	char		err_buf[80];

	if (objtype == RESC_RESV_OBJECT) {

#ifndef PBS_MOM		/*MOM doesn't know about resource reservations*/
		presv = resc_resv_alloc();   /* allocate & init resc_rescv struct */
		if (presv == (resc_resv *)0) {
			return ((void *)0);
		}
		pobj = (void *)presv;
		path = path_resvs;
		err_msg = "error opening reservation file";
		ptcs = "reservation Id %s does not match file name for %s";
		pobjID = presv->ri_qs.ri_resvID;
		p_fixed = (void *)&presv->ri_qs;
		fixed_size = sizeof(struct resvfix);
		prefix = presv->ri_qs.ri_fileprefix;
		p_attr_def = resv_attr_def;
		wattr = presv->ri_wattr;
		attr_unkn = RESV_ATR_UNKN;
		final_attr = RESV_ATR_LAST;
#else	/* PBS_MOM only: This will never come here for MOM!!! */
		return ((void *)0);
#endif

	} else {

		pj = job_alloc();           /* allocate & initialize job struct */
		if (pj == (job *)0) {
			return ((void *)0);
		}
		pobj = (void *)pj;
		path = path_jobs;
		err_msg = "error opening job file";
		ptcs = "Job Id %s does not match file name for %s";
		pobjID = pj->ji_qs.ji_jobid;
		p_fixed = (void *)&pj->ji_qs;
		fixed_size = sizeof(struct jobfix);
		/* NOTE(review): this choice is made before the fixed area is read
		 * from the file, so it depends on what job_alloc() put in
		 * ji_fileprefix - confirm that this is intended */
		if (*pj->ji_qs.ji_fileprefix != '\0')
			prefix = pj->ji_qs.ji_fileprefix;
		else
			prefix = pj->ji_qs.ji_jobid;
		p_attr_def = job_attr_def;
		wattr = pj->ji_wattr;
		attr_unkn = JOB_ATR_UNKN;
		final_attr = JOB_ATR_LAST;
	}

	/* build "<directory><filename>" without overrunning namebuf */
	pathlen = snprintf(namebuf, sizeof(namebuf), "%s%s", path, filename);
	if ((pathlen < 0) || ((size_t)pathlen >= sizeof(namebuf))) {
		(void)snprintf(err_buf, sizeof(err_buf),
			"path too long for %s", filename);
		log_err(-1, "job_or_resv_recov", err_buf);
		free((char *)pobj);
		return ((void *)0);
	}
#ifdef WIN32
	fix_perms(namebuf);
#endif
	fds = open(namebuf, O_RDONLY, 0);
	if (fds < 0) {
		sprintf(log_buffer, "%s on %s", err_msg, namebuf);
		log_err(errno, "job_or_resv_recov", log_buffer);
		free((char *)pobj);
		return ((void *)0);
	}
#ifdef WIN32
	setmode(fds, O_BINARY);
#endif

	/* read in job or resc_resv quick save sub-structure */

	if (read(fds, (char *)p_fixed, fixed_size) != fixed_size) {
		/* namebuf can be far longer than err_buf, so bound the write */
		(void)snprintf(err_buf, sizeof(err_buf),
			"problem reading %s", namebuf);
		log_err(errno, "job_or_resv_recov", err_buf);
		free((char *)pobj);
		(void)close(fds);
		return ((void *)0);
	}
	/* Does file name match the internal name? */
	/* This detects ghost files */

	pn = strrchr(namebuf, (int)'/');
#ifdef WIN32
	if (pn == NULL)
		pn = strrchr(namebuf, (int)'\\');
#endif
	if (pn == NULL) {
		/* no directory separator at all: refuse instead of computing
		 * NULL + 1; release pobj, which is valid for both object types */
		sprintf(log_buffer, "bad path %s", namebuf);
		log_err(errno, "job_or_resv_recov", log_buffer);
		free((char *)pobj);
		(void)close(fds);
		return ((void *)0);
	}
	pn++;

	if (strncmp(pn, prefix, strlen(prefix)) != 0) {
		/* mismatch, discard job (reservation) */

		(void)sprintf(log_buffer, ptcs, pobjID, namebuf);
		log_err(-1, "job_or_resv_recov", log_buffer);
		free((char *)pobj);
		(void)close(fds);
		return ((void *)0);
	}

	/* read in working attributes */

	if (recov_attr_fs(fds, pobj, p_attr_def, wattr,
		final_attr, attr_unkn) != 0) {

		log_err(errno, "job_or_resv_recov", "error from recov_attr");
		if (objtype == RESC_RESV_OBJECT) {

#ifndef PBS_MOM		/*MOM doesn't know about resource reservations*/
			resv_free((resc_resv *)pobj);
#endif
		} else {
			job_free((job *)pobj);
		}

		(void)close(fds);
		return ((void *)0);
	}

	(void)close(fds);

#if defined(PBS_MOM) && defined(WIN32)
	/* get a handle to the job (may not exist) */
	pj->ji_hJob = OpenJobObject(JOB_OBJECT_ALL_ACCESS, FALSE,
		pj->ji_qs.ji_jobid);
#endif

	/* all done recovering the job (reservation) */

	return (pobj);
}
Exemplo n.º 27
0
/*
 * job_recov_fs - recover a job from its save file in the job directory.
 *
 * filename	name of the job file, relative to path_jobs
 * recov_subjob	when equal to NO_RECOV_SUBJOB, array subjobs are skipped
 *
 * The file is renamed to its JOB_BAD_SUFFIX form while it is being read so
 * that a failed recovery is not retried on the same file, and renamed back
 * once the job has been read completely.
 *
 * The fixed area, the extended area (layout depends on the saved version),
 * the array tracking table (server only) and the working attributes are
 * read in that order.
 *
 * Returns the recovered job, or a null pointer on any failure.
 */
job *
job_recov_fs(char *filename, int recov_subjob)
{
	int		 fds;
	char		 basen[MAXPATHLEN+1];
	job		*pj;
	char		*pn;
	char		*psuffix;


	pj = job_alloc();	/* allocate & initialize job structure space */
	if (pj == (job *)0) {
		return ((job *)0);
	}

	(void)strcpy(pbs_recov_filename, path_jobs);	/* job directory path */
	(void)strcat(pbs_recov_filename, filename);
#ifdef WIN32
	fix_perms(pbs_recov_filename);
#endif

	/* change file name in case recovery fails so we don't try same file */

	(void)strcpy(basen, pbs_recov_filename);
	psuffix = basen + strlen(basen) - strlen(JOB_BAD_SUFFIX);
	(void)strcpy(psuffix, JOB_BAD_SUFFIX);
#ifdef WIN32
	if (MoveFileEx(pbs_recov_filename, basen,
		MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH) == 0) {
		errno = GetLastError();
		sprintf(log_buffer, "MoveFileEx(%s, %s) failed!",
			pbs_recov_filename, basen);
		log_err(errno, "nodes", log_buffer);

	}
	secure_file(basen, "Administrators",
		READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
#else
	if (rename(pbs_recov_filename, basen) == -1) {
		sprintf(log_buffer, "error renaming job file %s",
			pbs_recov_filename);
		log_err(errno, "job_recov", log_buffer);
		free((char *)pj);
		return ((job *)0);
	}
#endif

	fds = open(basen, O_RDONLY, 0);
	if (fds < 0) {
		sprintf(log_buffer, "error opening of job file %s",
			pbs_recov_filename);
		log_err(errno, "job_recov", log_buffer);
		free((char *)pj);
		return ((job *)0);
	}
#ifdef WIN32
	setmode(fds, O_BINARY);
#endif

	/* read in job fixed sub-structure */

	errno = -1;
	if (read(fds, (char *)&pj->ji_qs, fixedsize) != (int)fixedsize) {
		sprintf(log_buffer, "error reading fixed portion of %s",
			pbs_recov_filename);
		log_err(errno, "job_recov", log_buffer);
		free((char *)pj);
		(void)close(fds);
		return ((job *)0);
	}
	/* Does file name match the internal name? */
	/* This detects ghost files */

#ifdef WIN32
	pn = strrchr(pbs_recov_filename, (int)'/');
	if (pn == NULL)
		pn = strrchr(pbs_recov_filename, (int)'\\');
	if (pn == NULL) {
		sprintf(log_buffer, "bad path %s", pbs_recov_filename);
		log_err(errno, "job_recov", log_buffer);
		free((char *)pj);
		(void)close(fds);
		return ((job *)0);
	}
	pn++;
#else
	pn = strrchr(pbs_recov_filename, (int)'/') + 1;
#endif

	/* compare everything but the last three characters of the file name
	 * (presumably the suffix) against the job id */
	if (strncmp(pn, pj->ji_qs.ji_jobid, strlen(pn)-3) != 0) {
		/* mismatch, discard job */

		(void)sprintf(log_buffer,
			"Job Id %s does not match file name for %s",
			pj->ji_qs.ji_jobid,
			pbs_recov_filename);
		log_err(-1, "job_recov", log_buffer);
		free((char *)pj);
		(void)close(fds);
		return ((job *)0);
	}

	/* unless directed, don't recover Array Sub jobs */

	if ((pj->ji_qs.ji_svrflags & JOB_SVFLG_SubJob) &&
		(recov_subjob == NO_RECOV_SUBJOB)) {
		free((char *)pj);
		(void)close(fds);
		return ((job *)0);
	}

	/* read in extended save area depending on VERSION */

	errno = -1;
	DBPRT(("Job save version %d\n", pj->ji_qs.ji_jsversion))
	if (pj->ji_qs.ji_jsversion < JSVERSION_514) {
		/* If really old version, it wasn't there, abort out */
		sprintf(log_buffer,
			"Job structure version cannot be recovered for job %s",
			pbs_recov_filename);
		log_err(errno, "job_recov", log_buffer);
		free((char *)pj);
		(void)close(fds);
		return ((job *)0);
	} else if (pj->ji_qs.ji_jsversion < JSVERSION_80) {
		/* If older version, read and copy extended area     */
		if (recov_514_extend(fds, pj) != 0) {
			sprintf(log_buffer,
				"error reading extended portion"
				" of %s for prior version",
				pbs_recov_filename);
			log_err(errno, "job_recov", log_buffer);
			free((char *)pj);
			(void)close(fds);
			return ((job *)0);
		}
	} else {
		/* If current version, JSVERSION_80, read into place */
		if (read(fds, (char *)&pj->ji_extended,
			sizeof(union jobextend)) !=
			sizeof(union jobextend)) {
			sprintf(log_buffer,
				"error reading extended portion of %s",
				pbs_recov_filename);
			log_err(errno, "job_recov", log_buffer);
			free((char *)pj);
			(void)close(fds);
			return ((job *)0);
		}
	}
#ifndef PBS_MOM
	if (pj->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob) {
		size_t xs;		/* total size of the tracking table */
		size_t remaining;	/* bytes that follow the size field */
		long   nread;

		if (read(fds, (char *)&xs, sizeof(xs)) != sizeof(xs)) {
			sprintf(log_buffer,
				"error reading array section of %s",
				pbs_recov_filename);
			log_err(errno, "job_recov", log_buffer);
			free((char *)pj);
			(void)close(fds);
			return ((job *)0);
		}
		/* the stored size covers the size field itself; a smaller value
		 * means a corrupt file and would make the subtraction below wrap */
		if (xs < sizeof(xs)) {
			sprintf(log_buffer,
				"invalid array section size in %s",
				pbs_recov_filename);
			log_err(-1, "job_recov", log_buffer);
			free((char *)pj);
			(void)close(fds);
			return ((job *)0);
		}
		if ((pj->ji_ajtrk = (struct ajtrkhd *)malloc(xs)) == NULL) {
			free((char *)pj);
			(void)close(fds);
			return ((job *)0);
		}
		/* the rest of the table is placed right after the size field
		 * (assumes tkm_size is the leading member - TODO confirm) */
		remaining = xs - sizeof(xs);
		nread = (long)read(fds, (char *)pj->ji_ajtrk + sizeof(xs),
			remaining);
		if ((nread < 0) || ((size_t)nread != remaining)) {
			sprintf(log_buffer,
				"error reading array section of %s",
				pbs_recov_filename);
			log_err(errno, "job_recov", log_buffer);
			free((char *)pj->ji_ajtrk);
			free((char *)pj);
			(void)close(fds);
			return ((job *)0);
		}
		pj->ji_ajtrk->tkm_size = xs;
	}
#endif	/* not PBS_MOM */

	/* read in working attributes */

	if (recov_attr_fs(fds, pj, job_attr_def, pj->ji_wattr, (int)JOB_ATR_LAST,
		(int)JOB_ATR_UNKN) != 0) {
		sprintf(log_buffer, "error reading attributes portion of %s",
			pbs_recov_filename);
		log_err(errno, "job_recov", log_buffer);
		job_free(pj);
		(void)close(fds);
		return ((job *)0);
	}
	(void)close(fds);

#if defined(PBS_MOM) && defined(WIN32)
	/* get a handle to the job (may not exist) */
	pj->ji_hJob = OpenJobObject(JOB_OBJECT_ALL_ACCESS, FALSE,
		pj->ji_qs.ji_jobid);
#endif

	/* all done recovering the job, change file name back to .JB */

#ifdef WIN32
	if (MoveFileEx(basen, pbs_recov_filename,
		MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH) == 0) {
		errno = GetLastError();
		sprintf(log_buffer, "MoveFileEx(%s, %s) failed!",
			basen, pbs_recov_filename);
		log_err(errno, "nodes", log_buffer);

	}
	secure_file(pbs_recov_filename, "Administrators",
		READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
#else
	(void)rename(basen, pbs_recov_filename);
#endif

	return (pj);
}
Exemplo n.º 28
0
/*
 * req_quejob - service a Queue Job batch request.
 *
 * Only requests flagged as coming from a server (preq->rq_fromsvr) are
 * accepted.  The job named in the request is looked up with find_job()
 * and then on the svr_newjobs list; when it is not found a new job
 * structure is allocated.  The attributes carried by the request are
 * decoded into the job, the request is acknowledged with the job id and
 * the job is linked onto svr_newjobs.
 *
 * A job that already exists with JOB_SVFLG_CHECKPOINT_FILE set is kept:
 * only the ATTR_checkpoint_name and ATTR_v attributes are decoded for it.
 *
 * Failures detected before the acknowledgement are reported to the
 * requester through req_reject() or reply_badattr().  When the
 * acknowledgement itself cannot be sent the connection is closed.
 * Nothing is returned to the caller.
 */
void req_quejob(

  struct batch_request *preq) /* ptr to the decoded request   */

  {
  char  *id = "req_quejob";

  /* start out empty so a request without ATTR_hashname cannot leave
   * stack garbage in the job's file prefix */

  char   basename[PBS_JOBBASE + 1] = "";
  int    created_here = 0;
  int    index;
  char  *jid;
  attribute_def *pdef;
  job   *pj;
  svrattrl *psatl;
  int    rc;
  int    sock = preq->rq_conn;

  int    IsCheckpoint = 0;

  /* set basic (user) level access permission */

  resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_Creat;

  if (PBSNodeCheckProlog)
    {
    check_state(1);

    mom_server_all_update_stat();

    if (internal_state & INUSE_DOWN)
      {
      req_reject(PBSE_MOMREJECT,0,preq,NULL,NULL);

      return;
      }
    }

  if (preq->rq_fromsvr)
    {
    /* from another server - accept the extra attributes */

    resc_access_perm |= ATR_DFLAG_MGWR | ATR_DFLAG_SvWR | ATR_DFLAG_MOM;

    jid = preq->rq_ind.rq_queuejob.rq_jid;
    }
  else
    {
    /* request must be from server */

    /* not a system call failure, so there is no meaningful errno to log */

    log_err(-1, id, "request not from server");

    req_reject(PBSE_IVALREQ, 0, preq, NULL, "request not received from server");

    return;
    }

  /* does job already exist, check both old and new jobs */

  if ((pj = find_job(jid)) == NULL)
    {
    pj = (job *)GET_NEXT(svr_newjobs);

    while (pj != NULL)
      {
      if (!strcmp(pj->ji_qs.ji_jobid, jid))
        break;

      pj = (job *)GET_NEXT(pj->ji_alljobs);
      }
    }

  /*
   * New job ...
   *
   * for MOM - rather than make up a hashname, we use the name sent
   * to us by the server as an attribute.
   */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (!strcmp(psatl->al_name,ATTR_hashname))
      {
      /* the value arrived in the request: make sure it is present and
       * fits the local buffer before copying it */

      if ((psatl->al_value == NULL) ||
          (strlen(psatl->al_value) >= sizeof(basename)))
        {
        log_err(-1, id, "job file prefix in request is missing or too long");

        req_reject(PBSE_IVALREQ, 0, preq, NULL, "invalid job file prefix");

        return;
        }

      strcpy(basename,psatl->al_value);

      break;
      }

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }

  if (pj != NULL)
    {
    /* newly queued job already exists */

    if (pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING)
      {
      /* FAILURE - job exists and is running */

      log_err(-1,id,"cannot queue new job, job exists and is running");

      req_reject(PBSE_JOBEXIST,0,preq,NULL,"job is running");

      return;
      }

    /* if checkpointed, then keep old and skip rest of process */

    if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE)
      {
      IsCheckpoint = 1;
      }  /* END if (pj->ji_qs.ji_svrflags & JOB_SVFLG_CHECKPOINT_FILE) */
    else
      {
      /* unlink job from svr_alljobs since it will be placed on newjobs */

      delete_link(&pj->ji_alljobs);
      }
    }  /* END if (pj != NULL) */
  else
    {
    /* if not already here, allocate job struct */

    if ((pj = job_alloc()) == NULL)
      {
      /* FAILURE */

      req_reject(PBSE_SYSTEM, 0, preq, NULL, "cannot allocate new job structure");

      return;
      }
    }    /* END else (pj != NULL) */

  if (IsCheckpoint == 0)
    {
    strcpy(pj->ji_qs.ji_jobid,jid);

    /* basename was length-checked against PBS_JOBBASE above */

    strcpy(pj->ji_qs.ji_fileprefix,basename);

    pj->ji_modified       = 1;

    pj->ji_qs.ji_svrflags = created_here;

    pj->ji_qs.ji_un_type  = JOB_UNION_TYPE_NEW;
    }

  /* decode attributes from request into job structure */

  psatl = (svrattrl *)GET_NEXT(preq->rq_ind.rq_queuejob.rq_attr);

  while (psatl != NULL)
    {
    if (IsCheckpoint == 1)
      {
      /* for a kept checkpointed job only these two attributes are taken */

      if (strcmp(psatl->al_name,ATTR_checkpoint_name) &&
          strcmp(psatl->al_name,ATTR_v))
        {
        psatl = (svrattrl *)GET_NEXT(psatl->al_link);

        continue;
        }
      }

    /* identify the attribute by name */

    index = find_attr(job_attr_def,psatl->al_name,JOB_ATR_LAST);

    if (index < 0)
      {
      /* FAILURE */

      /* didn`t recognize the name */

      job_purge(pj);   /* CRI - 12/20/2004 */

      reply_badattr(PBSE_NOATTR,1,psatl,preq);

      return;
      }

    pdef = &job_attr_def[index];

    /* Is attribute not writeable by manager or by a server? */

    if ((pdef->at_flags & resc_access_perm) == 0)
      {
      /* FAILURE */

      job_purge(pj);

      reply_badattr(PBSE_ATTRRO,1,psatl,preq);

      return;
      }

    /* decode attribute */

    if (!strcmp(psatl->al_name,ATTR_v))
      {
      rc = decode_arst_merge(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value);
      }
    else
      {
      rc = pdef->at_decode(
             &pj->ji_wattr[index],
             psatl->al_name,
             psatl->al_resc,
             psatl->al_value);
      }

    if (rc != 0)
      {
      /* FAILURE */

      /* all errors are fatal for MOM */

      job_purge(pj);

      reply_badattr(rc,1,psatl,preq);

      return;
      }

    if (psatl->al_op == DFLT)
      {
      if (psatl->al_resc)
        {
        resource     *presc;
        resource_def *prdef;

        prdef = find_resc_def(svr_resc_def,psatl->al_resc,svr_resc_size);

        if (prdef == NULL)
          {
          /* FAILURE - resource name is not known.  rc is always 0 at
           * this point, so it must not be used as the error code: the
           * requester would see a "no error" reply for a purged job.
           * NOTE(review): a dedicated unknown-resource error code would
           * be more precise if the project's error header has one. */

          job_purge(pj);

          reply_badattr(PBSE_NOATTR,1,psatl, preq);

          return;
          }

        presc = find_resc_entry(&pj->ji_wattr[index],prdef);

        if (presc != NULL)
          presc->rs_value.at_flags |= ATR_VFLAG_DEFLT;
        }
      else
        {
        pj->ji_wattr[index].at_flags |= ATR_VFLAG_DEFLT;
        }
      }    /* END if (psatl->al_op == DFLT) */

    psatl = (svrattrl *)GET_NEXT(psatl->al_link);
    }      /* END while (psatl != NULL) */

  if (IsCheckpoint == 1)
    {
    pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

    if (reply_jobid(preq,pj->ji_qs.ji_jobid,BATCH_REPLY_CHOICE_Queue) == 0)
      {
      /* SUCCESS - move the kept job onto the new jobs list */

      delete_link(&pj->ji_alljobs);

      append_link(&svr_newjobs,&pj->ji_alljobs,pj);

      pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;
      pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;
      pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock);
      pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

      /* Per Eric R., req_mvjobfile was giving error in open_std_file, 
         showed up as fishy error message */

      if (pj->ji_grpcache != NULL)
        {
        free(pj->ji_grpcache);
        pj->ji_grpcache = NULL;
        }
      }
    else
      {
      /* acknowledgement could not be sent - the job is left as it was */

      close_conn(sock);
      }

    return;
    }

  /* set remaining job structure elements */

  pj->ji_qs.ji_state =    JOB_STATE_TRANSIT;

  pj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSIN;

  pj->ji_wattr[(int)JOB_ATR_mtime].at_val.at_long = (long)time_now;

  pj->ji_wattr[(int)JOB_ATR_mtime].at_flags |= ATR_VFLAG_SET;

  pj->ji_qs.ji_un_type = JOB_UNION_TYPE_NEW;

  pj->ji_qs.ji_un.ji_newt.ji_fromsock = sock;

  pj->ji_qs.ji_un.ji_newt.ji_fromaddr = get_connectaddr(sock);

  pj->ji_qs.ji_un.ji_newt.ji_scriptsz = 0;

  /* acknowledge the request with the job id */

  if (reply_jobid(preq, pj->ji_qs.ji_jobid, BATCH_REPLY_CHOICE_Queue) != 0)
    {
    /* reply failed, purge the job and close the connection */

    close_conn(sock);

    job_purge(pj);

    return;
    }

  /* link job into server's new jobs list request  */

  append_link(&svr_newjobs, &pj->ji_alljobs, pj);

  return;
  }  /* END req_quejob() */
Exemplo n.º 29
0
/*
 * job_recov - rebuild a job structure from its save file.
 *
 * filename is appended to path_jobs to form the path that is opened.
 * The quick-save area (ji_qs) is read first; when its qs_version is not
 * PBS_QS_VERSION the record is passed to job_qs_upgrade() and, once the
 * rest of the recovery has succeeded, the job is written back out with
 * job_save(pj, SAVEJOB_FULL).  The working attributes are read next.
 * PBS_MOM builds then try to recover the tm sockets, root task and job
 * flags (failures there are only logged); other builds re-link a job
 * that carries JOB_ATR_job_array_request to its parent array.
 *
 * Returns the recovered job, or NULL when the path is too long, the file
 * cannot be opened or read, the upgrade fails, the stored file prefix
 * does not match the file name, or the attributes cannot be recovered.
 */
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  int  fds;
  job *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  int    pathlen;
  int    qs_upgrade;
#ifndef PBS_MOM
  char   parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  qs_upgrade = FALSE;

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  /* job directory path followed by the file name; refuse a result that
   * would not fit instead of writing past the end of namebuf */

  pathlen = snprintf(namebuf, sizeof(namebuf), "%s%s", path_jobs, filename);

  if ((pathlen < 0) || ((size_t)pathlen >= sizeof(namebuf)))
    {
    log_err(-1, "job_recov", "job file path is too long");

    free((char *)pj);

    /* FAILURE - path does not fit */

    return(NULL);
    }

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    sprintf(log_buffer, "unable to open %s",
            namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    /* FAILURE - cannot open job file */

    return(NULL);
    }

  /* read in job quick save sub-structure */

  /* a read that does not return exactly quicksize bytes is treated as
   * fatal only when the version field holds the current version; any
   * other version falls through to the upgrade attempt below */

  if (read(fds, (char *)&pj->ji_qs, quicksize) != (ssize_t)quicksize &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    sprintf(log_buffer, "Unable to read %s",
            namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    sprintf(log_buffer,
            "%s appears to be from an old version. Attempting to convert.\n",
            namebuf);
    log_err(-1, "job_recov", log_buffer);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      sprintf(log_buffer, "unable to upgrade %s\n", namebuf);

      log_err(-1, "job_recov", log_buffer);

      free((char *)pj);

      close(fds);

      return(NULL);
      }

    qs_upgrade = TRUE;
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  /* compare against the last path component; when the path holds no '/'
   * at all the whole string is the file name */

  pn = strrchr(namebuf, (int)'/');

  if (pn != NULL)
    pn++;
  else
    pn = namebuf;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */

    sprintf(log_buffer, "Job Id %s does not match file name for %s",
            pj->ji_qs.ji_jobid,
            namebuf);

    log_err(-1, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        (int)JOB_ATR_LAST,
        (int)JOB_ATR_UNKN,
        TRUE) != 0)
    {
    sprintf(log_buffer, "unable to recover %s (file is likely corrupted)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);

    /* attributes may have been partly filled in, so use job_free() here
     * rather than a bare free() */

    job_free(pj);

    close(fds);

    return(NULL);
    }

#ifdef PBS_MOM
  /* read in tm sockets and ips */

  /* the three sections below are optional: a failure is logged as a
   * warning and recovery continues */

  if (recov_tmsock(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

  if (recov_roottask(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: root task not recovered from %s (written by an older pbs_mom?)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

  if (recov_jobflags(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: job flags not recovered from %s (written by an older pbs_mom?)", namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

#else /* PBS_MOM */

  if (pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      /* the job id is the array's own id */

      pj->ji_isparent = TRUE;
      }
    else
      {
      if (pa == NULL)
        {
        /* couldn't find array struct, it must not have been recovered,
        treat job as independent job?  perhaps we should delete the job
        XXX_JOB_ARRAY: should I unset this?*/
        pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags &= ~ATR_VFLAG_SET;
        }
      else
        {
        CLEAR_LINK(pj->ji_arrayjobs);
        append_link(&pa->array_alljobs, &pj->ji_arrayjobs, (void*)pj);
        pj->ji_arraystruct = pa;
        pa->jobs_recovered++;
        }
      }
    }

#endif

  close(fds);

  /* all done recovering the job */

  /* an upgraded record is rewritten so the file is in the current format */

  if (qs_upgrade == TRUE)
    {
    job_save(pj, SAVEJOB_FULL);
    }

  return(pj);
  }  /* END job_recov() */
Exemplo n.º 30
0
/**
 * @brief
 * 		create_subjob - create a Subjob from the parent Array Job
 * 		Certain attributes are changed or left out
 *
 * 		The parent's fixed save area (ji_qs) is copied as a whole, the
 * 		attributes listed in attrs_to_copy are carried over by encoding
 * 		them from the parent and decoding them into the subjob, the
 * 		array id / array index attributes are set, and the subjob is
 * 		enqueued with svr_enquejob().
 *
 * @param[in]	parent - pointer to parent Job; must have JOB_SVFLG_ArrayJob set
 * @param[in]	newjid -  new job id
 * @param[out]	rc -  return code: PBSE_NONE on success, otherwise
 * 			PBSE_IVALREQ (parent is not an array job, no index in
 * 			newjid, allocation failure, or enqueue failure),
 * 			PBSE_UNKJOBID (index not found in the parent) or
 * 			PBSE_BADSTATE (subjob entry is not in JOB_STATE_QUEUED)
 * @return	pointer to new job
 * @retval  NULL	- error
 */
job *
create_subjob(job *parent, char *newjid, int *rc)
{
	pbs_list_head  attrl;
	int	   i;
	int	   j;
	int	   indx;
	char	  *index;
	attribute_def *pdef;
	attribute *ppar;
	attribute *psub;
	svrattrl  *psatl;
	job 	  *subj;
	long	   eligibletime;
	long	    time_msec;
#ifdef	WIN32
	struct	_timeb	    tval;
#else
	struct timeval	    tval;
#endif


	if ((parent->ji_qs.ji_svrflags & JOB_SVFLG_ArrayJob) == 0) {
		*rc = PBSE_IVALREQ;
		return NULL;	/* parent not an array job */
	}

	/* find and copy the index */

	if ((index = get_index_from_jid(newjid)) == NULL) {
		*rc = PBSE_IVALREQ;
		return NULL;
	}
	if ((indx = subjob_index_to_offset(parent, index)) == -1) {
		*rc = PBSE_UNKJOBID;
		return NULL;
	}
	if (parent->ji_ajtrk->tkm_tbl[indx].trk_status != JOB_STATE_QUEUED) {
		*rc = PBSE_BADSTATE;
		return NULL;
	}

	/*
	 * allocate and clear basic structure
	 * cannot copy job attributes because cannot share strings and other
	 * malloc-ed data,  so copy ji_qs as a whole and then copy the
	 * non-saved items before ji_qs.
	 */

	subj = job_alloc();
	if (subj == NULL) {
		/*
		 * Allocation failed: bail out before the structure is touched.
		 * Reported with the same generic code this function uses for
		 * the enqueue failure at the end.
		 * NOTE(review): a system/out-of-memory error code would be
		 * more precise if the project's error header provides one.
		 */
		*rc = PBSE_IVALREQ;
		return NULL;
	}
	subj->ji_qs = parent->ji_qs;	/* copy the fixed save area */

#ifdef PBS_CRED_GRIDPROXY
	subj->ji_gsscontext  = parent->ji_gsscontext;
#endif
	subj->ji_qhdr     = parent->ji_qhdr;
	subj->ji_resvp    = parent->ji_resvp;
	subj->ji_myResv   = parent->ji_myResv;
	subj->ji_parentaj = parent;
	strcpy(subj->ji_qs.ji_jobid, newjid);	/* replace job id */
	*subj->ji_qs.ji_fileprefix = '\0';	/* do not inherit the parent's file prefix */
	subj->ji_subjindx = indx;

	/*
	 * now that is all done, copy the required attributes by
	 * encoding and then decoding into the new array.  Then add the
	 * subjob specific attributes.
	 */

	resc_access_perm = ATR_DFLAG_ACCESS;
	CLEAR_HEAD(attrl);
	for (i = 0; attrs_to_copy[i] != JOB_ATR_LAST; i++) {
		j    = (int)attrs_to_copy[i];
		ppar = &parent->ji_wattr[j];
		psub = &subj->ji_wattr[j];
		pdef = &job_attr_def[j];

		/* a positive return means entries were added to attrl */
		if (pdef->at_encode(ppar, &attrl, pdef->at_name, NULL,
			ATR_ENCODE_MOM, &psatl) > 0) {
			for (psatl = (svrattrl *)GET_NEXT(attrl); psatl;
				psatl = ((svrattrl *)GET_NEXT(psatl->al_link))) {
				pdef->at_decode(psub, psatl->al_name, psatl->al_resc,
					psatl->al_value);
			}
			/* carry forward the default bit if set */
			psub->at_flags |= (ppar->at_flags & ATR_VFLAG_DEFLT);
			free_attrlist(&attrl);
		}
	}

	/* array id attribute is the parent's job id */
	psub = &subj->ji_wattr[(int)JOB_ATR_array_id];
	job_attr_def[(int)JOB_ATR_array_id].at_decode(psub, NULL, NULL,
		parent->ji_qs.ji_jobid);

	/* array index attribute is the index taken from newjid */
	psub = &subj->ji_wattr[(int)JOB_ATR_array_index];
	job_attr_def[(int)JOB_ATR_array_index].at_decode(psub, NULL, NULL, index);

	/* Lastly, set or clear a few flags and link in the structure */

	subj->ji_qs.ji_svrflags &= ~JOB_SVFLG_ArrayJob;
	subj->ji_qs.ji_svrflags |=  JOB_SVFLG_SubJob;
	subj->ji_modified = 1;	/* ** will likely take this out ** */

	subj->ji_qs.ji_substate = JOB_SUBSTATE_TRANSICM;
	(void)svr_setjobstate(subj, JOB_STATE_QUEUED, JOB_SUBSTATE_QUEUED);
	subj->ji_wattr[(int)JOB_ATR_state].at_flags    |= ATR_VFLAG_SET;
	subj->ji_wattr[(int)JOB_ATR_substate].at_flags |= ATR_VFLAG_SET;

	/* subjob needs to borrow eligible time from parent job array.
	 * expecting only to accrue eligible_time and nothing else.
	 */
	if (server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) {

		eligibletime = parent->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long;

		/* parent is currently accruing eligible time: add the span
		 * between the two sample start times */
		if (parent->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE)
			eligibletime += subj->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long - parent->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long;

		subj->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long = eligibletime;
		subj->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODIFY | ATR_VFLAG_MODCACHE;

	}
	/* current wall-clock time in milliseconds */
#ifdef WIN32
	_ftime_s(&tval);
	time_msec = (tval.time * 1000L) + tval.millitm;
#else
	gettimeofday(&tval, NULL);
	time_msec = (tval.tv_sec * 1000L) + (tval.tv_usec/1000L);
#endif
	/* set the queue rank attribute */
	subj->ji_wattr[(int)JOB_ATR_qrank].at_val.at_long = time_msec;
	subj->ji_wattr[(int)JOB_ATR_qrank].at_flags |= ATR_VFLAG_SET|ATR_VFLAG_MODCACHE;
	if (svr_enquejob(subj) != 0) {
		/* could not be queued: release the subjob again */
		job_purge(subj);
		*rc = PBSE_IVALREQ;
		return NULL;
	}
	*rc = PBSE_NONE;
	return subj;
}