int set_array_job_ids( job **pjob, /* M */ char *log_buf, /* error Buffer */ size_t buflen) /* error buffer length */ { int rc = PBSE_NONE; #ifndef PBS_MOM job *pj = *pjob; job_array *pa; char parent_id[PBS_MAXSVRJOBID + 1]; if (strchr(pj->ji_qs.ji_jobid, '[') != NULL) { /* job is part of an array. We need to put a link back to the server job array struct for this array. We also have to link this job into the linked list of jobs belonging to the array. */ array_get_parent_id(pj->ji_qs.ji_jobid, parent_id); pa = get_array(parent_id); if (pa == NULL) { job_abt(&pj, (char *)"Array job missing array struct, aborting job"); snprintf(log_buf, buflen, "Array job missing array struct %s", __func__); return -1; } strcpy(pj->ji_arraystructid, parent_id); if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0) { pj->ji_is_array_template = TRUE; } else { pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid); pa->jobs_recovered++; /* This is a bit of a kluge, but for some reason if an array job was on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and the substate is JOB_SUBSTATE_HELD */ if ((pj->ji_qs.ji_state == JOB_STATE_HELD) && (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD)) { pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l; pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET; } } if (pa != NULL) { unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); } } #endif /* !PBS_MOM */ return rc; }
int upgrade_2_2_X( job *pj, /* I */ int fds) /* I */ { ji_qs_2_2_X qs_old; #ifndef PBS_MOM char range[PBS_MAXJOBARRAYLEN*2+2]; char parent_id[PBS_MAXSVRJOBID + 1]; attribute tempattr; job_array *pa; #endif if (read(fds, (char*)&qs_old, sizeof(qs_old)) != sizeof(qs_old)) { return (-1); } pj->ji_qs.qs_version = PBS_QS_VERSION; pj->ji_qs.ji_state = qs_old.ji_state; pj->ji_qs.ji_substate = qs_old.ji_substate; pj->ji_qs.ji_svrflags = qs_old.ji_svrflags; pj->ji_qs.ji_numattr = qs_old.ji_numattr; pj->ji_qs.ji_ordering = qs_old.ji_ordering; pj->ji_qs.ji_priority = qs_old.ji_priority; pj->ji_qs.ji_stime = qs_old.ji_stime; strcpy(pj->ji_qs.ji_jobid, qs_old.ji_jobid); strcpy(pj->ji_qs.ji_fileprefix, qs_old.ji_fileprefix); strcpy(pj->ji_qs.ji_queue, qs_old.ji_queue); strcpy(pj->ji_qs.ji_destin, qs_old.ji_destin); pj->ji_qs.ji_un_type = qs_old.ji_un_type; memcpy(&pj->ji_qs.ji_un, &qs_old.ji_un, sizeof(qs_old.ji_un)); #ifndef PBS_MOM array_get_parent_id(pj->ji_qs.ji_jobid, parent_id); if (is_array(parent_id)) { pa = get_array(parent_id); if (pa == NULL) { return(1); } sprintf(range, "0-%d", pa->ai_qs.array_size - 1); clear_attr(&tempattr, &job_attr_def[JOB_ATR_job_array_request]); job_attr_def[JOB_ATR_job_array_request].at_decode( &tempattr, NULL, NULL, range); job_attr_def[JOB_ATR_job_array_request].at_set( &(pj->ji_wattr[JOB_ATR_job_array_request]), &tempattr, SET); } #endif return (0); }
job *job_recov( char *filename) /* I */ /* pathname to job save file */ { int fds; job *pj; char *pn; char namebuf[MAXPATHLEN]; char log_buf[LOCAL_LOG_BUF_SIZE]; #ifndef PBS_MOM char parent_id[PBS_MAXSVRJOBID + 1]; job_array *pa; #endif pj = job_alloc(); /* allocate & initialize job structure space */ if (pj == NULL) { /* FAILURE - cannot alloc memory */ return(NULL); } snprintf(namebuf, MAXPATHLEN, "%s%s", path_jobs, filename); /* job directory path, filename */ fds = open(namebuf, O_RDONLY, 0); if (fds < 0) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to open %s", namebuf); log_err(errno, __func__, log_buf); #ifndef PBS_MOM unlock_ji_mutex(pj, __func__, "1", LOGLEVEL); free(pj->ji_mutex); #endif free((char *)pj); /* FAILURE - cannot open job file */ return(NULL); } /* read in job quick save sub-structure */ if (read_ac_socket(fds, (char *)&pj->ji_qs, sizeof(pj->ji_qs)) != sizeof(pj->ji_qs) && pj->ji_qs.qs_version == PBS_QS_VERSION) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Unable to read %s", namebuf); log_err(errno, __func__, log_buf); #ifndef PBS_MOM unlock_ji_mutex(pj, __func__, "2", LOGLEVEL); free(pj->ji_mutex); #endif free((char *)pj); close(fds); return(NULL); } /* is ji_qs the version we expect? */ if (pj->ji_qs.qs_version != PBS_QS_VERSION) { /* ji_qs is older version */ snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "%s appears to be from an old version. Attempting to convert.\n", namebuf); log_err(-1, __func__, log_buf); if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to upgrade %s\n", namebuf); log_err(-1, __func__, log_buf); #ifndef PBS_MOM unlock_ji_mutex(pj, __func__, "3", LOGLEVEL); free(pj->ji_mutex); #endif free((char *)pj); close(fds); return(NULL); } } /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */ /* Does file name match the internal name? */ /* This detects ghost files */ pn = strrchr(namebuf, (int)'/') + 1; if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0) { /* mismatch, discard job */ snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "Job Id %s does not match file name for %s", pj->ji_qs.ji_jobid, namebuf); log_err(-1, __func__, log_buf); #ifndef PBS_MOM unlock_ji_mutex(pj, __func__, "4", LOGLEVEL); free(pj->ji_mutex); #endif free((char *)pj); close(fds); return(NULL); } /* read in working attributes */ if (recov_attr( fds, pj, job_attr_def, pj->ji_wattr, JOB_ATR_LAST, JOB_ATR_UNKN, TRUE) != 0) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "unable to recover %s (file is likely corrupted)", namebuf); log_err(-1, __func__, log_buf); #ifndef PBS_MOM unlock_ji_mutex(pj, __func__, "5", LOGLEVEL); job_free(pj, FALSE); #else mom_job_free(pj); #endif close(fds); return(NULL); } #ifndef PBS_MOM /* Comment out the mother superior tracking. Will be debugged later if (pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL) {*/ /* add job to the mother superior list for it's node */ /* char *ms = strdup(pj->ji_wattr[JOB_ATR_exec_host].at_val.at_str); char *end = strchr(ms, '/'); if (end != NULL) *end = '\0'; if ((end = strchr(ms, '+')) != NULL) *end = '\0'; add_to_ms_list(ms, pj); free(ms); }*/ #endif #ifdef PBS_MOM /* read in tm sockets and ips */ if (recov_tmsock(fds, pj) != 0) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "warning: tmsockets not recovered from %s (written by an older pbs_mom?)", namebuf); log_err(-1, __func__, log_buf); } #else /* not PBS_MOM */ if (strchr(pj->ji_qs.ji_jobid, '[') != NULL) { /* job is part of an array. We need to put a link back to the server job array struct for this array. We also have to link this job into the linked list of jobs belonging to the array. */ array_get_parent_id(pj->ji_qs.ji_jobid, parent_id); pa = get_array(parent_id); if (pa == NULL) { job_abt(&pj, (char *)"Array job missing array struct, aborting job"); close(fds); return NULL; } strcpy(pj->ji_arraystructid, parent_id); if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0) { pj->ji_is_array_template = TRUE; } else { pa->job_ids[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = strdup(pj->ji_qs.ji_jobid); pa->jobs_recovered++; /* This is a bit of a kluge, but for some reason if an array job was on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and the substate is JOB_SUBSTATE_HELD */ if ((pj->ji_qs.ji_state == JOB_STATE_HELD) && (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD)) { pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l; pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET; } } if (pa != NULL) { unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); } } #endif close(fds); pj->ji_commit_done = 1; /* all done recovering the job */ job_save(pj, SAVEJOB_FULL, 0); return(pj); } /* END job_recov() */
job *job_recov( char *filename) /* I */ /* pathname to job save file */ { int fds; job *pj; char *pn; char namebuf[MAXPATHLEN]; int qs_upgrade; #ifndef PBS_MOM char parent_id[PBS_MAXSVRJOBID + 1]; job_array *pa; #endif qs_upgrade = FALSE; pj = job_alloc(); /* allocate & initialize job structure space */ if (pj == NULL) { /* FAILURE - cannot alloc memory */ return(NULL); } strcpy(namebuf, path_jobs); /* job directory path */ strcat(namebuf, filename); fds = open(namebuf, O_RDONLY, 0); if (fds < 0) { sprintf(log_buffer, "unable to open %s", namebuf); log_err(errno, "job_recov", log_buffer); free((char *)pj); /* FAILURE - cannot open job file */ return(NULL); } /* read in job quick save sub-structure */ if (read(fds, (char *)&pj->ji_qs, quicksize) != (ssize_t)quicksize && pj->ji_qs.qs_version == PBS_QS_VERSION) { sprintf(log_buffer, "Unable to read %s", namebuf); log_err(errno, "job_recov", log_buffer); free((char *)pj); close(fds); return(NULL); } /* is ji_qs the version we expect? */ if (pj->ji_qs.qs_version != PBS_QS_VERSION) { /* ji_qs is older version */ sprintf(log_buffer, "%s appears to be from an old version. Attempting to convert.\n", namebuf); log_err(-1, "job_recov", log_buffer); if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0) { sprintf(log_buffer, "unable to upgrade %s\n", namebuf); log_err(-1, "job_recov", log_buffer); free((char *)pj); close(fds); return(NULL); } qs_upgrade = TRUE; } /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */ /* Does file name match the internal name? */ /* This detects ghost files */ pn = strrchr(namebuf, (int)'/') + 1; if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0) { /* mismatch, discard job */ sprintf(log_buffer, "Job Id %s does not match file name for %s", pj->ji_qs.ji_jobid, namebuf); log_err(-1, "job_recov", log_buffer); free((char *)pj); close(fds); return(NULL); } /* read in working attributes */ if (recov_attr( fds, pj, job_attr_def, pj->ji_wattr, (int)JOB_ATR_LAST, (int)JOB_ATR_UNKN, TRUE) != 0) { sprintf(log_buffer, "unable to recover %s (file is likely corrupted)", namebuf); log_err(-1, "job_recov", log_buffer); job_free(pj); close(fds); return(NULL); } #ifdef PBS_MOM /* read in tm sockets and ips */ if (recov_tmsock(fds, pj) != 0) { sprintf(log_buffer, "warning: tmsockets not recovered from %s (written by an older pbs_mom?)", namebuf); log_err(-1, "job_recov", log_buffer); } if (recov_roottask(fds, pj) != 0) { sprintf(log_buffer, "warning: root task not recovered from %s (written by an older pbs_mom?)", namebuf); log_err(-1, "job_recov", log_buffer); } if (recov_jobflags(fds, pj) != 0) { sprintf(log_buffer, "warning: job flags not recovered from %s (written by an older pbs_mom?)", namebuf); log_err(-1, "job_recov", log_buffer); } #else /* PBS_MOM */ if (pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET) { /* job is part of an array. We need to put a link back to the server job array struct for this array. We also have to link this job into the linked list of jobs belonging to the array. */ array_get_parent_id(pj->ji_qs.ji_jobid, parent_id); pa = get_array(parent_id); if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0) { pj->ji_isparent = TRUE; } else { if (pa == NULL) { /* couldn't find array struct, it must not have been recovered, treat job as indepentent job? perhaps we should delete the job XXX_JOB_ARRAY: should I unset this?*/ pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags &= ~ATR_VFLAG_SET; } else { CLEAR_LINK(pj->ji_arrayjobs); append_link(&pa->array_alljobs, &pj->ji_arrayjobs, (void*)pj); pj->ji_arraystruct = pa; pa->jobs_recovered++; } } } #endif close(fds); /* all done recovering the job */ if (qs_upgrade == TRUE) { job_save(pj, SAVEJOB_FULL); } return(pj); } /* END job_recov() */
job *job_recov( char *filename) /* I */ /* pathname to job save file */ { int fds; job *pj; char *pn; char namebuf[MAXPATHLEN]; int qs_upgrade; #ifndef PBS_MOM char parent_id[PBS_MAXSVRJOBID + 1]; job_array *pa; #endif qs_upgrade = FALSE; pj = job_alloc(); /* allocate & initialize job structure space */ if (pj == NULL) { /* FAILURE - cannot alloc memory */ return(NULL); } strcpy(namebuf, path_jobs); /* job directory path */ strcat(namebuf, filename); fds = open(namebuf, O_RDONLY, 0); if (fds < 0) { sprintf(log_buffer, "unable to open %s", namebuf); log_err(errno, "job_recov", log_buffer); free((char *)pj); /* FAILURE - cannot open job file */ return(NULL); } /* read in job quick save sub-structure */ if (read(fds, (char *)&pj->ji_qs, quicksize) != (ssize_t)quicksize && pj->ji_qs.qs_version == PBS_QS_VERSION) { sprintf(log_buffer, "Unable to read %s", namebuf); log_err(errno, "job_recov", log_buffer); free((char *)pj); close(fds); return(NULL); } /* is ji_qs the version we expect? */ if (pj->ji_qs.qs_version != PBS_QS_VERSION) { /* ji_qs is older version */ sprintf(log_buffer, "%s appears to be from an old version. Attempting to convert.\n", namebuf); log_err(-1, "job_recov", log_buffer); if (job_qs_upgrade(pj, fds, namebuf, pj->ji_qs.qs_version) != 0) { sprintf(log_buffer, "unable to upgrade %s\n", namebuf); log_err(-1, "job_recov", log_buffer); free((char *)pj); close(fds); return(NULL); } qs_upgrade = TRUE; } /* END if (pj->ji_qs.qs_version != PBS_QS_VERSION) */ /* Does file name match the internal name? */ /* This detects ghost files */ pn = strrchr(namebuf, (int)'/') + 1; if (strncmp(pn, pj->ji_qs.ji_fileprefix, strlen(pj->ji_qs.ji_fileprefix)) != 0) { /* mismatch, discard job */ sprintf(log_buffer, "Job Id %s does not match file name for %s", pj->ji_qs.ji_jobid, namebuf); log_err(-1, "job_recov", log_buffer); free((char *)pj); close(fds); return(NULL); } /* read in working attributes */ if (recov_attr( fds, pj, job_attr_def, pj->ji_wattr, (int)JOB_ATR_LAST, (int)JOB_ATR_UNKN, TRUE) != 0) { sprintf(log_buffer, "unable to recover %s (file is likely corrupted)", namebuf); log_err(-1, "job_recov", log_buffer); job_free(pj); close(fds); return(NULL); } #ifdef PBS_MOM /* read in tm sockets and ips */ if (recov_tmsock(fds, pj) != 0) { sprintf(log_buffer, "warning: tmsockets not recovered from %s (written by an older pbs_mom?)", namebuf); log_err(-1, "job_recov", log_buffer); } #else /* PBS_MOM */ if (pj->ji_wattr[(int)JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET) { /* job is part of an array. We need to put a link back to the server job array struct for this array. We also have to link this job into the linked list of jobs belonging to the array. */ array_get_parent_id(pj->ji_qs.ji_jobid, parent_id); pa = get_array(parent_id); if (pa == NULL) { job_abt(&pj, "Array job missing array struct, aborting job"); close(fds); return NULL; } if (strcmp(parent_id, pj->ji_qs.ji_jobid) == 0) { pj->ji_is_array_template = TRUE; pj->ji_arraystruct = pa; } else { pa->jobs[(int)pj->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = (void *)pj; pj->ji_arraystruct = pa; pa->jobs_recovered++; /* This is a bit of a kluge, but for some reason if an array job was on hold when the server went down the ji_wattr[JOB_ATR_hold].at_val.at_long value is 0 on recovery even though pj->ji_qs.ji_state is JOB_STATE_HELD and the substate is JOB_SUBSTATE_HELD */ if ((pj->ji_qs.ji_state == JOB_STATE_HELD) && (pj->ji_qs.ji_substate == JOB_SUBSTATE_HELD)) { pj->ji_wattr[JOB_ATR_hold].at_val.at_long = HOLD_l; pj->ji_wattr[JOB_ATR_hold].at_flags = ATR_VFLAG_SET; } } } #endif close(fds); /* all done recovering the job */ job_save(pj, SAVEJOB_FULL); return(pj); } /* END job_recov() */