Example #1
0
/*
 * job_recov_binary() - recover a job from a binary job save file.
 *
 * Reads the quick-save sub-structure (ji_qs) from filename into the job
 * referenced by *pjob, converts it with job_qs_upgrade() when the stored
 * version differs from PBS_QS_VERSION, checks that the file's base name
 * starts with the recorded file prefix, and then recovers the working
 * attributes.  Under PBS_MOM the tm sockets are recovered as well; a
 * failure there is only logged.
 *
 * Returns PBSE_NONE on success and -1 on failure.  On failure a description
 * of the problem is left in log_buf.  This function never frees the job.
 */
int job_recov_binary(

  char *filename,  /* I */   /* pathname to job save file */
  job  **pjob,     /* M */   /* pointer to a pointer of job structure to fill info */
  char *log_buf,   /* O */   /* buffer to hold error message */
  size_t buf_len)  /* I */   /* len of the error buffer */

  {
  int  fds;
  job  *pj = *pjob;
  char *pn;

#ifdef PBS_MOM
  char fileid[MAXPATHLEN];
#endif

  fds = open(filename, O_RDONLY, 0);

  if (fds < 0)
    {
    snprintf(log_buf, buf_len, "unable to open %s", filename);
    return -1;
    }

  /* read in job quick save sub-structure */
  /* NOTE: a short read is fatal only when the version field already matches
   * PBS_QS_VERSION; with any other version the upgrade path below gets a
   * chance to convert the file. */

  if (read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs)) != sizeof(pj->ji_qs) &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    snprintf(log_buf, buf_len, "Unable to read %s", filename);
    close(fds);
    return -1;
    }

  /* is ji_qs the version we expect? */
  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    snprintf(log_buf, buf_len,
      "%s appears to be from an old version. Attempting to convert.\n",
      filename);
    log_err(-1, __func__, log_buf);

    if (job_qs_upgrade(pj, fds, filename, pj->ji_qs.qs_version) != 0)
      {
      snprintf(log_buf, buf_len, "unable to upgrade %s\n", filename);
      close(fds);
      return -1;
      }
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  /* strrchr() returns NULL when filename has no directory component; in
   * that case the whole string is the base name */
  pn = strrchr(filename, (int)'/');

  if (pn != NULL)
    pn++;
  else
    pn = filename;

#ifndef PBS_MOM
  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
#else
  /* with multi_mom set, the expected name is the prefix followed by the port */
  if (multi_mom != 0)
    {
    snprintf(fileid, sizeof(fileid), "%s%d", pj->ji_qs.ji_fileprefix, pbs_rm_port);
    }
  else
    {
    snprintf(fileid, sizeof(fileid), "%s", pj->ji_qs.ji_fileprefix);
    }
  if (strncmp(pn, fileid, strlen(fileid)) != 0)
#endif
    {
    /* mismatch, discard job */

    snprintf(log_buf, buf_len, "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      filename);

    close(fds);
    return -1;
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        JOB_ATR_LAST,
        JOB_ATR_UNKN,
        TRUE) != 0) 
    {
    snprintf(log_buf, buf_len, "unable to recover %s (file is likely corrupted)", filename);
    close(fds);
    return -1;
    }

#ifdef PBS_MOM
  /* read in tm sockets and ips */
  /* failure here is not fatal: it is logged and recovery still succeeds */

  if (recov_tmsock(fds, pj) != 0)
    {
    snprintf(log_buf, buf_len,
        "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
        filename);

    log_err(-1, __func__, log_buf);
    }
#endif /* PBS_MOM */

  close(fds);

  return PBSE_NONE;
  }  /* END job_recov_binary() */
Example #2
0
/*
 * que_recov() - rebuild a queue from its save file under path_queues.
 *
 * Allocates a queue, reads the fixed (qu_qs) portion and then the
 * attributes from the file, and reloads every ACL-typed attribute through
 * recov_acl().  When the recovered queue has no mtime set, the time sampled
 * on entry is stored as its mtime and the queue is saved again.
 *
 * Returns the recovered queue, or NULL when allocation, open, or either
 * read step fails (a partially built queue is released with que_free()).
 */
pbs_queue *que_recov(

  char *filename) /* pathname to queue save file */

  {
  time_t     now = time(NULL);  /* sampled up front; used if mtime is missing */
  char       qpath[MAXPATHLEN];
  pbs_queue *pque;
  int        qfd;
  int        idx;

  /* allocate & init queue structure space */
  pque = que_alloc(filename, TRUE);

  if (pque == NULL)
    {
    log_err(-1, __func__, "que_alloc failed");

    return(NULL);
    }

  snprintf(qpath, sizeof(qpath), "%s%s", path_queues, filename);

  qfd = open(qpath, O_RDONLY, 0);

  if (qfd < 0)
    {
    log_err(errno, __func__, "open error");

    que_free(pque, TRUE);

    return(NULL);
    }

  /* the fixed-size queue save sub-structure comes first */

  if (read_ac_socket(qfd, (char *)&pque->qu_qs, sizeof(queuefix)) != sizeof(queuefix))
    {
    log_err(errno, __func__, "read error");

    goto discard;
    }

  /* then the queue attributes */

  if (recov_attr(qfd, pque, que_attr_def, pque->qu_attr, QA_ATR_LAST, 0, TRUE) != 0)
    {
    log_err(-1, __func__, "recov_attr[common] failed");

    goto discard;
    }

  /* access control lists were saved separately; reload each one */

  for (idx = 0; idx < QA_ATR_LAST; ++idx)
    {
    if (pque->qu_attr[idx].at_type != ATR_TYPE_ACL)
      continue;

    recov_acl(
      &pque->qu_attr[idx],
      &que_attr_def[idx],
      que_attr_def[idx].at_name,
      pque->qu_qs.qu_name);
    }

  /* all done recovering the queue */

  close(qfd);

  if (pque->qu_attr[QA_ATR_MTime].at_flags & ATR_VFLAG_SET)
    return(pque);

  /* recovering a pre-2.1.2 queue: stamp and save a new mtime */

  pque->qu_attr[QA_ATR_MTime].at_val.at_long = now;
  pque->qu_attr[QA_ATR_MTime].at_flags = ATR_VFLAG_SET;

  que_save(pque);

  return(pque);

discard:

  /* shared failure exit once the file is open */

  que_free(pque, TRUE);
  close(qfd);

  return(NULL);
  }
Example #3
0
/*
 * job_recov() - recover a job from its save file under path_jobs.
 *
 * Allocates a job, reads the quick-save sub-structure (ji_qs), converts it
 * with job_qs_upgrade() when its version differs from PBS_QS_VERSION,
 * rejects files whose base name does not start with the recorded file
 * prefix, and recovers the working attributes.  Under PBS_MOM the tm
 * sockets, root task and job flags are then read (failures are only
 * logged); otherwise array sub-jobs are linked back to their job_array.
 * A job that went through the upgrade path is saved again in full.
 *
 * Returns the recovered job, or NULL on any fatal error.
 */
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  int  fds;
  int  len;
  job *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  int    qs_upgrade;
#ifndef PBS_MOM
  char   parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  qs_upgrade = FALSE;

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  /* job directory path + file name, bounded to namebuf */

  len = snprintf(namebuf, sizeof(namebuf), "%s%s", path_jobs, filename);

  if ((len < 0) || ((size_t)len >= sizeof(namebuf)))
    {
    /* FAILURE - the full pathname does not fit in namebuf */

    log_err(-1, "job_recov", "job file pathname is too long");

    free((char *)pj);

    return(NULL);
    }

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    sprintf(log_buffer, "unable to open %s",
            namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    /* FAILURE - cannot open job file */

    return(NULL);
    }

  /* read in job quick save sub-structure */
  /* NOTE: a short read is fatal only when the version field already matches
   * PBS_QS_VERSION; otherwise the upgrade path below is tried */

  if (read(fds, (char *)&pj->ji_qs, quicksize) != (ssize_t)quicksize &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    sprintf(log_buffer, "Unable to read %s",
            namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    sprintf(log_buffer,
            "%s appears to be from an old version. Attempting to convert.\n",
            namebuf);
    log_err(-1, "job_recov", log_buffer);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      sprintf(log_buffer, "unable to upgrade %s\n", namebuf);

      log_err(-1, "job_recov", log_buffer);

      free((char *)pj);

      close(fds);

      return(NULL);
      }

    qs_upgrade = TRUE;
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  /* strrchr() returns NULL when the path holds no '/'; then the whole
   * string is the base name */
  pn = strrchr(namebuf, (int)'/');

  if (pn != NULL)
    pn++;
  else
    pn = namebuf;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */

    sprintf(log_buffer, "Job Id %s does not match file name for %s",
            pj->ji_qs.ji_jobid,
            namebuf);

    log_err(-1, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        (int)JOB_ATR_LAST,
        (int)JOB_ATR_UNKN,
        TRUE) != 0)
    {
    sprintf(log_buffer, "unable to recover %s (file is likely corrupted)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);

    job_free(pj);

    close(fds);

    return(NULL);
    }

#ifdef PBS_MOM
  /* read in tm sockets and ips */
  /* the three MOM-only sections below are optional: failures are logged
   * as warnings and recovery continues */

  if (recov_tmsock(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

  if (recov_roottask(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: root task not recovered from %s (written by an older pbs_mom?)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

  if (recov_jobflags(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: job flags not recovered from %s (written by an older pbs_mom?)", namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

#else /* PBS_MOM */

  if (pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      /* the job id equals the parent id: this job is the array parent */
      pj->ji_isparent = TRUE;
      }
    else
      {
      if (pa == NULL)
        {
        /* couldn't find array struct, it must not have been recovered,
        treat job as independent job?  perhaps we should delete the job
        XXX_JOB_ARRAY: should I unset this?*/
        pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags &= ~ATR_VFLAG_SET;
        }
      else
        {
        CLEAR_LINK(pj->ji_arrayjobs);
        append_link(&pa->array_alljobs, &pj->ji_arrayjobs, (void*)pj);
        pj->ji_arraystruct = pa;
        pa->jobs_recovered++;
        }
      }
    }

#endif

  close(fds);

  /* all done recovering the job */

  /* rewrite the file in the current format if it was converted above */
  if (qs_upgrade == TRUE)
    {
    job_save(pj, SAVEJOB_FULL);
    }

  return(pj);
  }  /* END job_recov() */
Example #4
0
/*
 * svr_recov() - recover the server state from the server database file.
 *
 * With the sv_qs mutex held, reads the fixed server_qs structure into
 * server.sv_qs and then the server attributes.  Unless read_only is set,
 * the job id number read from the fixed structure is restored afterwards
 * and mirrored into the SRV_ATR_NextJobNumber attribute.  Finally every
 * ACL-typed server attribute is reloaded through recov_acl() and, when not
 * read_only, its at_action routine (if any) is invoked with
 * ATR_ACTION_RECOV.
 *
 * Returns PBSE_NONE on success, -1 when the file cannot be opened or read.
 */
int svr_recov(

  char *svrfile,  /* I */
  int read_only)  /* I */

  {
  int  i;
  int  sdb;
  int  saved_errno;

  void recov_acl(pbs_attribute *, attribute_def *, const char *, const char *);

  sdb = open(svrfile, O_RDONLY, 0);

  if (sdb < 0)
    {
    if (errno == ENOENT)
      {
      char tmpLine[LOG_BUF_SIZE];

      snprintf(tmpLine, sizeof(tmpLine), "cannot locate server database '%s' - use 'pbs_server -t create' to create new database if database has not been initialized.",
               svrfile);

      log_err(errno, __func__, tmpLine);
      }
    else
      {
      log_err(errno, __func__, msg_svdbopen);
      }

    return(-1);
    }

  /* read in server structure */
  lock_sv_qs_mutex(server.sv_qs_mutex, __func__);

  i = read_ac_socket(sdb, (char *) & server.sv_qs, sizeof(server_qs));

  if (i != sizeof(server_qs))
    {
    /* capture errno before the unlock call has a chance to change it */
    saved_errno = errno;

    unlock_sv_qs_mutex(server.sv_qs_mutex, __func__);

    if (i < 0)
      log_err(saved_errno, __func__, "read of serverdb failed");
    else
      log_err(-1, __func__, "short read of serverdb");  /* errno is not meaningful for a short read */

    close(sdb);

    return(-1);
    }

  /* Save the sv_jobidnumber field in case it is set by the attributes. */
  i = server.sv_qs.sv_jobidnumber;

  /* read in server attributes */

  if (recov_attr(
        sdb,
        &server,
        svr_attr_def,
        server.sv_attr,
        SRV_ATR_LAST,
        0,
        !read_only) != 0 ) 
    {
    /* capture errno before the unlock call has a chance to change it */
    saved_errno = errno;

    unlock_sv_qs_mutex(server.sv_qs_mutex, __func__);
    log_err(saved_errno, __func__, "error on recovering server attr");

    close(sdb);

    return(-1);
    }

  /* Restore the current job number and make it visible in qmgr print server command. */

  if (!read_only)
    {
    server.sv_qs.sv_jobidnumber = i;

    server.sv_attr[SRV_ATR_NextJobNumber].at_val.at_long = i;

    server.sv_attr[SRV_ATR_NextJobNumber].at_flags |= ATR_VFLAG_SET| ATR_VFLAG_MODIFY;
    }

  unlock_sv_qs_mutex(server.sv_qs_mutex, __func__);

  close(sdb);

  /* recover the server various acls from their own files */

  for (i = 0;i < SRV_ATR_LAST;i++)
    {
    if (server.sv_attr[i].at_type == ATR_TYPE_ACL)
      {
      recov_acl(
        &server.sv_attr[i],
        &svr_attr_def[i],
        PBS_SVRACL,
        svr_attr_def[i].at_name);

      /* run the attribute's action routine only for a real (non read-only) recovery */
      if ((!read_only) && (svr_attr_def[i].at_action != (int (*)(pbs_attribute*, void*, int))0))
        {
        svr_attr_def[i].at_action(
          &server.sv_attr[i],
          &server,
          ATR_ACTION_RECOV);
        }
      }
    }    /* END for (i) */

  return(PBSE_NONE);
  }  /* END svr_recov() */
Example #5
0
/*
 * svr_recov() - recover the server state from the server database file.
 *
 * Reads the fixed server_qs structure into server.sv_qs.  If the bytes
 * read contain the marker "<server_db>", the file is treated as XML and
 * recovery is delegated to svr_recov_xml().  Otherwise the server
 * attributes are read; unless read_only is set, the job id number from the
 * fixed structure is restored and mirrored into SRV_ATR_NextJobNumber.
 * Finally every ACL-typed server attribute is reloaded through recov_acl()
 * and, when not read_only, its at_action routine (if any) is invoked with
 * ATR_ACTION_RECOV.
 *
 * Returns 0 on success, -1 when the file cannot be opened or read, or the
 * result of svr_recov_xml() for an XML database.
 */
int svr_recov(

  char *svrfile,  /* I */
  int read_only)  /* I */

  {
  static char *id = "svr_recov";
  int i;
  int sdb;

  void recov_acl(attribute *, attribute_def *, char *, char *);

  sdb = open(svrfile, O_RDONLY, 0);

  if (sdb < 0)
    {
    if (errno == ENOENT)
      {
      char tmpLine[LOG_BUF_SIZE];

      snprintf(tmpLine, sizeof(tmpLine), "cannot locate server database '%s' - use 'pbs_server -t create' to create new database if database has not been initialized.",
               svrfile);

      log_err(errno, id, tmpLine);
      }
    else
      {
      log_err(errno, id, msg_svdbopen);
      }

    return(-1);
    }

  /* read in server structure */

  i = read(sdb, (char *) & server.sv_qs, sizeof(struct server_qs));

  if (i != sizeof(struct server_qs))
    {
    if (i < 0)
      log_err(errno, id, "read of serverdb failed");
    else
      log_err(-1, id, "short read of serverdb");  /* errno is not meaningful for a short read */

    close(sdb);

    return(-1);
    }

  /* NOTE(review): the raw structure bytes are scanned as a C string here;
   * this relies on a terminating NUL being present - confirm */
  if (strstr((char *)&server.sv_qs,"<server_db>") != NULL)
    {
    /* attempt to read the server database in xml */
    log_event(PBSEVENT_SYSTEM,
      PBS_EVENTCLASS_SERVER,
      id,
      "serverdb appears to be in xml format, attempting to read xml\n");

    /* svr_recov_xml() is handed the file name, not this descriptor, so
     * release the descriptor here rather than leaking it */
    close(sdb);

    return(svr_recov_xml(svrfile,read_only));
    }

  /* Save the sv_jobidnumber field in case it is set by the attributes. */
  i = server.sv_qs.sv_jobidnumber;

  /* read in server attributes */

  if (recov_attr(
        sdb,
        &server,
        svr_attr_def,
        server.sv_attr,
        (int)SRV_ATR_LAST,
        0,
        !read_only) != 0 ) 
    {
    log_err(errno, id, "error on recovering server attr");

    close(sdb);

    return(-1);
    }

  /* Restore the current job number and make it visible in qmgr print server command. */

  if (!read_only)
    {
    server.sv_qs.sv_jobidnumber = i;

    server.sv_attr[(int)SRV_ATR_NextJobNumber].at_val.at_long = i;

    server.sv_attr[(int)SRV_ATR_NextJobNumber].at_flags |= ATR_VFLAG_SET| ATR_VFLAG_MODIFY;
    }

  close(sdb);

  /* recover the server various acls from their own files */

  for (i = 0;i < SRV_ATR_LAST;i++)
    {
    if (server.sv_attr[i].at_type == ATR_TYPE_ACL)
      {
      recov_acl(
        &server.sv_attr[i],
        &svr_attr_def[i],
        PBS_SVRACL,
        svr_attr_def[i].at_name);

      /* run the attribute's action routine only for a real (non read-only) recovery */
      if ((!read_only) && (svr_attr_def[i].at_action != (int (*)())0))
        {
        svr_attr_def[i].at_action(
          &server.sv_attr[i],
          &server,
          ATR_ACTION_RECOV);
        }
      }
    }    /* END for (i) */

  return(0);
  }  /* END svr_recov() */
Example #6
0
/*
 * job_recov() - recover a job from its save file under path_jobs.
 *
 * Allocates a job, reads the quick-save sub-structure (ji_qs), converts it
 * with job_qs_upgrade() when its version differs from PBS_QS_VERSION,
 * rejects files whose base name does not start with the recorded file
 * prefix, and recovers the working attributes.  Under PBS_MOM the tm
 * sockets are then read (failure is only logged).  In the server build a
 * job whose id contains '[' is tied back to its job_array; if that array
 * cannot be found the job is aborted.  The recovered job is marked
 * commit-done and saved in full before it is returned.
 *
 * In the server build every early failure path unlocks the job mutex
 * before the job is released.
 *
 * Returns the recovered job, or NULL on any fatal error.
 */
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  int  fds;
  job  *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  char  log_buf[LOCAL_LOG_BUF_SIZE];

#ifndef PBS_MOM
  char       parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  snprintf(namebuf, sizeof(namebuf), "%s%s", path_jobs, filename); /* job directory path, filename */

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    snprintf(log_buf, sizeof(log_buf), "unable to open %s", namebuf);

    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "1", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    /* FAILURE - cannot open job file */

    return(NULL);
    }

  /* read in job quick save sub-structure */
  /* NOTE: a short read is fatal only when the version field already matches
   * PBS_QS_VERSION; otherwise the upgrade path below is tried */

  if (read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs)) != sizeof(pj->ji_qs) &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    snprintf(log_buf, sizeof(log_buf), "Unable to read %s", namebuf);

    log_err(errno, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "2", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    snprintf(log_buf, sizeof(log_buf),
      "%s appears to be from an old version. Attempting to convert.\n",
      namebuf);

    log_err(-1, __func__, log_buf);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      snprintf(log_buf, sizeof(log_buf), "unable to upgrade %s\n", namebuf);

      log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
      unlock_ji_mutex(pj, __func__, "3", LOGLEVEL);
      free(pj->ji_mutex);
#endif

      free((char *)pj);

      close(fds);

      return(NULL);
      }

    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  /* strrchr() returns NULL when the path holds no '/'; then the whole
   * string is the base name */
  pn = strrchr(namebuf, (int)'/');

  if (pn != NULL)
    pn++;
  else
    pn = namebuf;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */

    snprintf(log_buf, sizeof(log_buf), "Job Id %s does not match file name for %s",
      pj->ji_qs.ji_jobid,
      namebuf);

    log_err(-1, __func__, log_buf);

#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "4", LOGLEVEL);
    free(pj->ji_mutex);
#endif

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        JOB_ATR_LAST,
        JOB_ATR_UNKN,
        TRUE) != 0) 
    {
    snprintf(log_buf, sizeof(log_buf), "unable to recover %s (file is likely corrupted)", namebuf);

    log_err(-1, __func__, log_buf);

    /* attributes may be partially recovered, so use the full job free
     * routines here instead of a bare free() */
#ifndef PBS_MOM
    unlock_ji_mutex(pj, __func__, "5", LOGLEVEL);
    job_free(pj, FALSE);
#else
    mom_job_free(pj);
#endif


    close(fds);

    return(NULL);
    }

#ifndef PBS_MOM
  /* Comment out the mother superior tracking. Will be debugged later 
  if (pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)
    {*/
    /* add job to the mother superior list for it's node */
/*    char *ms = strdup(pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str);
    char *end = strchr(ms, '/');

    if (end != NULL)
      *end = '\0';

    if ((end = strchr(ms, '+')) != NULL)
      *end = '\0';

    add_to_ms_list(ms, pj);

    free(ms);
    }*/
#endif

#ifdef PBS_MOM
  /* read in tm sockets and ips */
  /* failure here is not fatal: it is logged and recovery continues */

  if (recov_tmsock(fds, pj) != 0)
    {
    snprintf(log_buf, sizeof(log_buf),
        "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
        namebuf);

    log_err(-1, __func__, log_buf);
    }

#else /* not PBS_MOM */

  if (strchr(pj->ji_qs.ji_jobid, '[') != NULL)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);
    if (pa == NULL)
      {   
      job_abt(&pj, (char *)"Array job missing array struct, aborting job");
      close(fds);
      return NULL;
      }

    strcpy(pj->ji_arraystructid, parent_id);

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      /* the job id equals the parent id: this job is the array template */
      pj->ji_is_array_template = TRUE;
      }
    else
      {
      /* NOTE(review): the array index comes straight from the recovered
       * attribute and is not range-checked here - confirm bounds */
      pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid);
      pa->jobs_recovered++;

      /* This is a bit of a kluge, but for some reason if an array job was 
         on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long
         value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and
         the substate is JOB_SUBSTATE_HELD
      */
      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) &&
          (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
        }
      }

    /* pa is known to be non-NULL here (the NULL case returned above);
     * presumably get_array() hands the array back locked - TODO confirm */
    unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);
    }

#endif

  close(fds);

  pj->ji_commit_done = 1;

  /* all done recovering the job */

  job_save(pj, SAVEJOB_FULL, 0);

  return(pj);
  }  /* END job_recov() */
Example #7
0
/*
 * job_recov() - recover a job from its save file under path_jobs.
 *
 * Allocates a job, reads the quick-save sub-structure (ji_qs), converts it
 * with job_qs_upgrade() when its version differs from PBS_QS_VERSION,
 * rejects files whose base name does not start with the recorded file
 * prefix, and recovers the working attributes.  Under PBS_MOM the tm
 * sockets are then read (failure is only logged).  In the server build a
 * job carrying the job_array_request attribute is tied back to its
 * job_array; if that array cannot be found the job is aborted.  The
 * recovered job is always saved in full before it is returned.
 *
 * Returns the recovered job, or NULL on any fatal error.
 */
job *job_recov(

  char *filename) /* I */   /* pathname to job save file */

  {
  int  fds;
  int  len;
  job *pj;
  char *pn;
  char  namebuf[MAXPATHLEN];
  int    qs_upgrade;
#ifndef PBS_MOM
  char   parent_id[PBS_MAXSVRJOBID + 1];
  job_array *pa;
#endif

  qs_upgrade = FALSE;

  pj = job_alloc(); /* allocate & initialize job structure space */

  if (pj == NULL)
    {
    /* FAILURE - cannot alloc memory */

    return(NULL);
    }

  /* job directory path + file name, bounded to namebuf */

  len = snprintf(namebuf, sizeof(namebuf), "%s%s", path_jobs, filename);

  if ((len < 0) || ((size_t)len >= sizeof(namebuf)))
    {
    /* FAILURE - the full pathname does not fit in namebuf */

    log_err(-1, "job_recov", "job file pathname is too long");

    free((char *)pj);

    return(NULL);
    }

  fds = open(namebuf, O_RDONLY, 0);

  if (fds < 0)
    {
    sprintf(log_buffer, "unable to open %s",
            namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    /* FAILURE - cannot open job file */

    return(NULL);
    }

  /* read in job quick save sub-structure */
  /* NOTE: a short read is fatal only when the version field already matches
   * PBS_QS_VERSION; otherwise the upgrade path below is tried */

  if (read(fds, (char *)&pj->ji_qs, quicksize) != (ssize_t)quicksize &&
      pj->ji_qs.qs_version == PBS_QS_VERSION)
    {
    sprintf(log_buffer, "Unable to read %s",
            namebuf);

    log_err(errno, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* is ji_qs the version we expect? */

  if (pj->ji_qs.qs_version != PBS_QS_VERSION)
    {
    /* ji_qs is older version */
    sprintf(log_buffer,
            "%s appears to be from an old version. Attempting to convert.\n",
            namebuf);
    log_err(-1, "job_recov", log_buffer);

    if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0)
      {
      sprintf(log_buffer, "unable to upgrade %s\n", namebuf);

      log_err(-1, "job_recov", log_buffer);

      free((char *)pj);

      close(fds);

      return(NULL);
      }

    qs_upgrade = TRUE;
    }  /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */

  /* Does file name match the internal name? */
  /* This detects ghost files */

  /* strrchr() returns NULL when the path holds no '/'; then the whole
   * string is the base name */
  pn = strrchr(namebuf, (int)'/');

  if (pn != NULL)
    pn++;
  else
    pn = namebuf;

  if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0)
    {
    /* mismatch, discard job */

    sprintf(log_buffer, "Job Id %s does not match file name for %s",
            pj->ji_qs.ji_jobid,
            namebuf);

    log_err(-1, "job_recov", log_buffer);

    free((char *)pj);

    close(fds);

    return(NULL);
    }

  /* read in working attributes */

  if (recov_attr(
        fds,
        pj,
        job_attr_def,
        pj->ji_wattr,
        (int)JOB_ATR_LAST,
        (int)JOB_ATR_UNKN,
        TRUE) != 0) 
    {
    sprintf(log_buffer, "unable to recover %s (file is likely corrupted)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);

    job_free(pj);

    close(fds);

    return(NULL);
    }

#ifdef PBS_MOM
  /* read in tm sockets and ips */
  /* failure here is not fatal: it is logged and recovery continues */

  if (recov_tmsock(fds, pj) != 0)
    {
    sprintf(log_buffer, "warning: tmsockets not recovered from %s (written by an older pbs_mom?)",
            namebuf);

    log_err(-1, "job_recov", log_buffer);
    }

#else /* PBS_MOM */

  if (pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)
    {
    /* job is part of an array.  We need to put a link back to the server
    job array struct for this array. We also have to link this job into
    the linked list of jobs belonging to the array. */

    array_get_parent_id(pj->ji_qs.ji_jobid, parent_id);
    pa = get_array(parent_id);
    if (pa == NULL)
      {   
      job_abt(&pj, "Array job missing array struct, aborting job");
      close(fds);
      return NULL;
      }

    if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0)
      {
      /* the job id equals the parent id: this job is the array template */
      pj->ji_is_array_template = TRUE;
      pj->ji_arraystruct = pa;
      }
    else
      {
      /* NOTE(review): the array index comes straight from the recovered
       * attribute and is not range-checked here - confirm bounds */
      pa->jobs[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = (void *)pj;
      pj->ji_arraystruct = pa; 
      pa->jobs_recovered++;

      /* This is a bit of a kluge, but for some reason if an array job was 
         on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long
         value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and
         the substate is JOB_SUBSTATE_HELD
      */
      if ((pj->ji_qs.ji_state == JOB_STATE_HELD) && (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD))
        {
        pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l;
        pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET;
        }
      }
    }

#endif

  close(fds);

  /* all done recovering the job */

  job_save(pj, SAVEJOB_FULL);

  return(pj);
  }  /* END job_recov() */