/** * @brief * Function to migrate filesystem data to database. * Reads serverdb, scheddb, job files, node, nodestate, queue, resv information * from the filesystem and save them into the database. All the information is * recovered and saved into the database under a single database transaction, * so any failure rolls back all the updates to the database. If all the updates * to the database succeed, only then the respective files are deleted from the * filesystem, else no deletion takes place. * * @return Error code * @retval 0 : success * @retval -1 : Failure * */ int svr_migrate_data_from_fs(void) { int baselen; struct dirent *pdirent; DIR *dir; int had; char *job_suffix = JOB_FILE_SUFFIX; int job_suf_len = strlen(job_suffix); job *pjob = NULL; pbs_queue *pque; resc_resv *presv; char *psuffix; int rc; int recovered = 0; char basen[MAXPATHLEN+1]; char scrfile[MAXPATHLEN+1]; char jobfile[MAXPATHLEN+1]; char origdir[MAXPATHLEN+1]; int fd; struct stat stbuf; char *scrbuf = NULL; pbs_db_jobscr_info_t jobscr; pbs_db_obj_info_t obj; path_svrdb_new = build_path(path_priv, PBS_SERVERDB, new_tag); path_scheddb = build_path(path_priv, PBS_SCHEDDB, NULL); path_scheddb_new = build_path(path_priv, PBS_SCHEDDB, new_tag); path_queues = build_path(path_priv, PBS_QUEDIR, suffix_slash); path_resvs = build_path(path_priv, PBS_RESVDIR, suffix_slash); path_nodes = build_path(path_priv, NODE_DESCRIP, NULL); path_nodestate = build_path(path_priv, NODE_STATUS, NULL); /* If not a "create" initialization, recover server db */ /* and sched db */ if (chk_save_file(path_svrdb) != 0) { fprintf(stderr, "No serverdb found to update to datastore\n"); return (0); } if (setup_resc(1) == -1) { fprintf(stderr, "%s\n", log_buffer); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); return (-1); } init_server_attrs(); /* start a database transation for the whole recovery */ if (pbs_db_begin_trx(svr_db_conn, 0, 0) != 0) return (-1); /* preprocess the nodes file to convert old properties to resources */ if (setup_nodes_fs(1) == -1) { fprintf(stderr, "%s\n", log_buffer); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); return (-1); } /* Open the server database (save file) and read it in */ if (svr_recov_fs(path_svrdb) == -1) { fprintf(stderr, "%s\n", msg_init_baddb); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); return (-1); } /* save server information to database now */ if (svr_save_db(&server, SVR_SAVE_NEW) != 0) { fprintf(stderr, "Could not save server db\n"); if (svr_db_conn->conn_db_err) fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); return (-1); } /* now do sched db */ if (sched_recov_fs(path_scheddb) == -1) { fprintf(stderr, "Unable to recover scheddb\n"); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); return (-1); } if (sched_save_db(&scheduler, SVR_SAVE_NEW) != 0) { fprintf(stderr, "Could not save scheduler db\n"); if (svr_db_conn->conn_db_err) fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); return (-1); } /* save current working dir before any chdirs */ if (getcwd(origdir, MAXPATHLEN) == NULL) { fprintf(stderr, "getcwd failed\n"); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); return (-1); } if (chdir(path_queues) != 0) { fprintf(stderr, msg_init_chdir, path_queues); fprintf(stderr, "\n"); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); chdir(origdir); return (-1); } had = server.sv_qs.sv_numque; server.sv_qs.sv_numque = 0; dir = opendir("."); if (dir == (DIR *) 0) { fprintf(stderr, "%s\n", msg_init_noqueues); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); chdir(origdir); return (-1); } while (errno = 0, (pdirent = readdir(dir)) != (struct dirent *) 0) { if (chk_save_file(pdirent->d_name) == 0) { if ((pque = que_recov_fs(pdirent->d_name)) != (pbs_queue *) 0) { /* que_recov increments sv_numque */ fprintf(stderr, msg_init_recovque, pque->qu_qs.qu_name); fprintf(stderr, "\n"); if (que_save_db(pque, QUE_SAVE_NEW) != 0) { fprintf(stderr, "Could not save queue info for queue %s\n", pque->qu_qs.qu_name); if (svr_db_conn->conn_db_err) fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); (void) closedir(dir); chdir(origdir); return (-1); } } } } if (errno != 0 && errno != ENOENT) { fprintf(stderr, "%s\n", msg_init_noqueues); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); (void) closedir(dir); chdir(origdir); return (-1); } (void) closedir(dir); if (had != server.sv_qs.sv_numque) { fprintf(stderr, msg_init_expctq, had, server.sv_qs.sv_numque); fprintf(stderr, "\n"); } /* Open and read in node list if one exists */ if (setup_nodes_fs(0) == -1) { fprintf(stderr, "%s\n", log_buffer); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); chdir(origdir); return (-1); } /* * Recover reservations. */ if (chdir(path_resvs) != 0) { fprintf(stderr, msg_init_chdir, path_resvs); fprintf(stderr, "\n"); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); chdir(origdir); return (-1); } dir = opendir("."); if (dir == (DIR *) 0) { fprintf(stderr, "%s\n", msg_init_noresvs); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); chdir(origdir); return (-1); } while (errno = 0, (pdirent = readdir(dir)) != (struct dirent *) 0) { if (chk_save_file(pdirent->d_name) == 0) { presv = (resc_resv *) job_or_resv_recov_fs(pdirent->d_name, RESC_RESV_OBJECT); if (presv != (resc_resv *) 0) { if (resv_save_db(presv, SAVERESV_NEW) != 0) { fprintf(stderr, "Could not save resv info for resv %s\n", presv->ri_qs.ri_resvID); if (svr_db_conn->conn_db_err) fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); (void) closedir(dir); chdir(origdir); return (-1); } } } } if (errno != 0 && errno != ENOENT) { fprintf(stderr, "%s\n", msg_init_noresvs); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); (void) closedir(dir); chdir(origdir); return (-1); } (void) closedir(dir); /* * Recover jobs */ if (chdir(path_jobs) != 0) { fprintf(stderr, msg_init_chdir, path_jobs); fprintf(stderr, "\n"); chdir(origdir); return (-1); } had = server.sv_qs.sv_numjobs; server.sv_qs.sv_numjobs = 0; recovered = 0; dir = opendir("."); if (dir == (DIR *) 0) { if (had == 0) { fprintf(stderr, msg_init_nojobs); } else { fprintf(stderr, msg_init_exptjobs, had, 0); } fprintf(stderr, "\n"); } else { /* Now, for each job found ... */ while (errno = 0, (pdirent = readdir(dir)) != (struct dirent *) 0) { if (chk_save_file(pdirent->d_name) != 0) continue; /* recover the job */ baselen = strlen(pdirent->d_name) - job_suf_len; psuffix = pdirent->d_name + baselen; if (strcmp(psuffix, job_suffix)) continue; if ((pjob = job_recov_fs(pdirent->d_name, RECOV_SUBJOB)) == NULL) { (void)strcpy(basen, pdirent->d_name); psuffix = basen + baselen; (void)strcpy(psuffix, JOB_BAD_SUFFIX); (void)snprintf(log_buffer, sizeof(log_buffer), "moved bad file to %s", basen); log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE, msg_daemonname, log_buffer); continue; } if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) { /* load the job script file */ strcpy(scrfile, path_jobs); #ifndef WIN32 /* under WIN32, there's already a prefixed '/' */ (void) strcat(scrfile, "/"); #endif strcat(scrfile, pdirent->d_name); baselen = strlen(scrfile) - strlen(JOB_FILE_SUFFIX); scrfile[baselen] = 0; /* put null char */ strcat(scrfile, JOB_SCRIPT_SUFFIX); rc = 1; #ifdef WIN32 if ((fd = open(scrfile, O_BINARY | O_RDONLY)) != -1) #else if ((fd = open(scrfile, O_RDONLY)) != -1) #endif { /* load the script */ if (fstat(fd, &stbuf) == 0) { if ((scrbuf = malloc(stbuf.st_size + 1))) { if (read(fd, scrbuf, stbuf.st_size) == stbuf.st_size) { scrbuf[stbuf.st_size] = '\0'; /* null character */ rc = 0; /* success loading */ } } } close(fd); } if (rc != 0) { fprintf(stderr, "Could not recover script file for job %s\n", pjob->ji_qs.ji_jobid); (void) strcpy(basen, scrfile); psuffix = basen + strlen(scrfile) - strlen(JOB_SCRIPT_SUFFIX); (void) strcpy(psuffix, JOB_BAD_SUFFIX); (void) strcpy(jobfile, scrfile); psuffix = jobfile + strlen(jobfile) - strlen(JOB_SCRIPT_SUFFIX); (void) strcpy(psuffix, JOB_FILE_SUFFIX); #ifdef WIN32 if (MoveFileEx(jobfile, basen, MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH) == 0) { errno = GetLastError(); snprintf(log_buffer, sizeof(log_buffer), "MoveFileEx(%s, %s) failed!", jobfile, basen); log_err(errno, "script", log_buffer); } secure_file(basen, "Administrators", READS_MASK | WRITES_MASK | STANDARD_RIGHTS_REQUIRED); #else if (rename(jobfile, basen) == -1) { snprintf(log_buffer, sizeof(log_buffer), "error renaming job file %s", jobfile); log_err(errno, "job_recov", log_buffer); } #endif (void) snprintf(log_buffer, sizeof(log_buffer), "moved bad file to %s", basen); log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, LOG_NOTICE, msg_daemonname, log_buffer); free(scrbuf); scrbuf = NULL; continue; } } /* now save job first */ if (job_save_db(pjob, SAVEJOB_NEW) != 0) { fprintf(stderr, "Could not save job info for jobid %s\n", pjob->ji_qs.ji_jobid); if (svr_db_conn->conn_db_err) fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); (void) closedir(dir); chdir(origdir); free(scrbuf); return (-1); } if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) { /* save job script */ strcpy(jobscr.ji_jobid, pjob->ji_qs.ji_jobid); jobscr.script = scrbuf; obj.pbs_db_obj_type = PBS_DB_JOBSCR; obj.pbs_db_un.pbs_db_jobscr = &jobscr; if (pbs_db_insert_obj(svr_db_conn, &obj) != 0) { fprintf(stderr, "Could not save job script for jobid %s\n", pjob->ji_qs.ji_jobid); if (svr_db_conn->conn_db_err) fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); free(scrbuf); (void) closedir(dir); chdir(origdir); return (-1); } free(scrbuf); scrbuf = NULL; } recovered++; } if (errno != 0 && errno != ENOENT) { if (pjob) fprintf(stderr, "readdir error for jobid %s\n", pjob->ji_qs.ji_jobid); else fprintf(stderr, "readdir error\n"); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); free(scrbuf); (void) closedir(dir); chdir(origdir); return (-1); } (void) closedir(dir); if (had != recovered) { fprintf(stderr, msg_init_exptjobs, had, recovered); fprintf(stderr, "\n"); } } if (save_nodes_db(0) != 0) { fprintf(stderr, "Could not save nodes\n"); if (svr_db_conn->conn_db_err) fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err); (void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK); chdir(origdir); return (-1); } if (pbs_db_end_trx(svr_db_conn, PBS_DB_COMMIT) == 0) { rm_migrated_files(path_priv); chdir(origdir); return (0); } chdir(origdir); return -1; }
/** * @brief * main - the initialization and main loop of pbs_daemon */ int main(int argc, char *argv[]) { char jobfile[MAXPATHLEN+1]; char jobfile_full[MAXPATHLEN+1]; pbs_net_t hostaddr = 0; int port = -1; int move_type = -1; pbs_list_head attrl; enum conn_type cntype = ToServerDIS; int con = -1; char *destin; int encode_type; int i; job *jobp; char job_id[PBS_MAXSVRJOBID+1]; attribute *pattr; struct attropl *pqjatr; /* list (single) of attropl for quejob */ char script_name[MAXPATHLEN+1]; int in_server = -1; char *param_name, *param_val; char buf[4096]; struct hostent *hp; struct in_addr addr; char *credbuf = NULL; size_t credlen = 0; int prot = PROT_TCP; /*the real deal or output version and exit?*/ execution_mode(argc, argv); /* If we are not run with real and effective uid of 0, forget it */ pbs_loadconf(0); if (!isAdminPrivilege(getlogin())) { fprintf(stderr, "%s: Must be run by root\n", argv[0]); exit(SEND_JOB_FATAL); } /* initialize the pointers in the resource_def array */ for (i = 0; i < (svr_resc_size - 1); ++i) svr_resc_def[i].rs_next = &svr_resc_def[i+1]; /* last entry is left with null pointer */ /* set single threaded mode */ pbs_client_thread_set_single_threaded_mode(); /* disable attribute verification */ set_no_attribute_verification(); /* initialize the thread context */ if (pbs_client_thread_init_thread_context() != 0) { fprintf(stderr, "%s: Unable to initialize thread context\n", argv[0]); exit(SEND_JOB_FATAL); } if(set_msgdaemonname("PBS_send_job")) { fprintf(stderr, "Out of memory\n"); return 1; } winsock_init(); connection_init(); while (fgets(buf, sizeof(buf), stdin) != NULL) { buf[strlen(buf)-1] = '\0'; /* gets rid of newline */ param_name = buf; param_val = strchr(buf, '='); if (param_val) { *param_val = '\0'; param_val++; } else { /* bad param_val -- skipping */ break; } if (strcmp(param_name, "jobfile") == 0) { jobfile[0] = '\0'; strncpy(jobfile, param_val, MAXPATHLEN); } else if (strcmp(param_name, "destaddr") == 0) { hostaddr = atol(param_val); } else if (strcmp(param_name, "destport") == 0) { port = atoi(param_val); } else if (strcmp(param_name, "move_type") == 0) { move_type = atoi(param_val); } else if (strcmp(param_name, "in_server") == 0) { in_server = atoi(param_val); } else if (strcmp(param_name, "server_name") == 0) { server_name[0] = '\0'; strncpy(server_name, param_val, PBS_MAXSERVERNAME); } else if (strcmp(param_name, "server_host") == 0) { server_host[0] = '\0'; strncpy(server_host, param_val, (sizeof(server_host) - 1)); } else if (strcmp(param_name, "server_addr") == 0) { pbs_server_addr = atol(param_val); } else if (strcmp(param_name, "server_port") == 0) { pbs_server_port_dis = atoi(param_val); } else if (strcmp(param_name, "log_file") == 0) { log_file = strdup(param_val); } else if (strcmp(param_name, "path_log") == 0) { path_log[0] = '\0'; strncpy(path_log, param_val, MAXPATHLEN); } else if (strcmp(param_name, "path_jobs") == 0) { path_jobs = strdup(param_val); } else if (strcmp(param_name, "path_spool") == 0) { path_spool = strdup(param_val); } else if (strcmp(param_name, "path_rescdef") == 0) { path_rescdef = strdup(param_val); } else if (strcmp(param_name, "path_users") == 0) { path_users = strdup(param_val); } else if (strcmp(param_name, "path_hooks_workdir") == 0) { path_hooks_workdir = strdup(param_val); if (path_hooks_workdir == NULL) exit(SEND_JOB_FATAL); } else if (strcmp(param_name, "svr_history_enable") == 0) { svr_history_enable = atol(param_val); } else if (strcmp(param_name, "svr_history_duration") == 0) { svr_history_duration = atol(param_val); } else if (strcmp(param_name, "single_signon_password_enable") == 0) { if (decode_b(&server.sv_attr[(int)SRV_ATR_ssignon_enable], NULL, NULL, param_val) != 0) { fprintf(stderr, "%s: failed to set ssignon_password_enable\n", argv[0]); exit(SEND_JOB_FATAL); } } else if (strcmp(param_name, "script_name") == 0) { strncpy(script_name, param_val, MAXPATHLEN + 1); } else break; } time(&time_now); (void)log_open_main(log_file, path_log, 1); /* silent open */ if (setup_resc(1) == -1) { /* log_buffer set in setup_resc */ log_err(-1, "pbsd_send_job(setup_resc)", log_buffer); return (-1); } if( strlen(jobfile) == 0 || hostaddr == 0 || port == 0 || move_type == -1 || \ in_server == -1 || strlen(server_name) == 0 || strlen(server_host) == 0 || \ pbs_server_addr == 0 || pbs_server_port_dis == 0 || \ strlen(path_log) == 0 || path_jobs == NULL || \ path_spool == NULL || path_users == NULL ) { log_err(-1, "pbs_send_job", "error on one of the parameters"); log_close(0); /* silent close */ exit(SEND_JOB_FATAL); } CLEAR_HEAD(task_list_immed); CLEAR_HEAD(task_list_timed); CLEAR_HEAD(task_list_event); CLEAR_HEAD(svr_queues); CLEAR_HEAD(svr_alljobs); CLEAR_HEAD(svr_newjobs); CLEAR_HEAD(svr_allresvs); CLEAR_HEAD(svr_newresvs); CLEAR_HEAD(svr_deferred_req); CLEAR_HEAD(svr_unlicensedjobs); strcpy(jobfile_full, path_jobs); strcat(jobfile_full, jobfile); if (chk_save_file(jobfile_full) != 0) { sprintf(log_buffer, "Error opening jobfile=%s", jobfile); log_err(-1, __func__, log_buffer); goto fatal_exit; } if ((jobp=job_recov_fs(jobfile, RECOV_SUBJOB)) == NULL) { sprintf(log_buffer, "Failed to recreate job in jobfile=%s", jobfile); log_err(-1, __func__, log_buffer); goto fatal_exit; } /* now delete the temp job file that was created by job_save_fs in server code * jobs are in database now, no need to keep in filesystem */ unlink(jobfile_full); if (in_server) append_link(&svr_alljobs, &jobp->ji_alljobs, jobp); /* select attributes/resources to send based on move type */ if (move_type == MOVE_TYPE_Exec) { resc_access_perm = ATR_DFLAG_MOM; encode_type = ATR_ENCODE_MOM; cntype = ToServerDIS; } else { resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_OPWR | ATR_DFLAG_MGWR | ATR_DFLAG_SvRD; encode_type = ATR_ENCODE_SVR; svr_dequejob(jobp); } CLEAR_HEAD(attrl); pattr = jobp->ji_wattr; for (i=0; i < (int)JOB_ATR_LAST; i++) { if ((job_attr_def+i)->at_flags & resc_access_perm) { (void)(job_attr_def+i)->at_encode(pattr+i, &attrl, (job_attr_def+i)->at_name, NULL, encode_type, NULL); } } attrl_fixlink(&attrl); /* script name is passed from parent */ /* get host name */ pbs_loadconf(0); addr.s_addr = htonl(hostaddr); hp = gethostbyaddr((void *)&addr, sizeof(struct in_addr), AF_INET); if (hp == NULL) { sprintf(log_buffer, "%s: h_errno=%d", inet_ntoa(addr), h_errno); log_err(-1, __func__, log_buffer); } else { /* read any credential file */ (void)get_credential(hp->h_name, jobp, PBS_GC_BATREQ, &credbuf, &credlen); } /* save the job id for when after we purge the job */ (void)strcpy(job_id, jobp->ji_qs.ji_jobid); con = -1; DIS_tcparray_init(); for (i=0; i<RETRY; i++) { pbs_errno = 0; /* connect to receiving server with retries */ if (i > 0) { /* recycle after an error */ if (con >= 0) svr_disconnect(con); if (should_retry_route(pbs_errno) == -1) { goto fatal_exit; /* fatal error, don't retry */ } sleep(1<<i); } if ((con = svr_connect(hostaddr, port, 0, cntype, prot)) == PBS_NET_RC_FATAL) { (void)sprintf(log_buffer, "send_job failed to %lx port %d", hostaddr, port); log_err(pbs_errno, __func__, log_buffer); goto fatal_exit; } else if (con == PBS_NET_RC_RETRY) { pbs_errno = WSAECONNREFUSED; /* should retry */ continue; } /* * if the job is substate JOB_SUBSTATE_TRNOUTCM which means * we are recovering after being down or a late failure, we * just want to send the "read-to-commit/commit" */ if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM) { if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT) { jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT; } pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl; destin = jobp->ji_qs.ji_destin; if (PBSD_queuejob(con, jobp->ji_qs.ji_jobid, destin, pqjatr, NULL, prot, NULL)== 0) { if (pbs_errno == PBSE_JOBEXIST && move_type == MOVE_TYPE_Exec) { /* already running, mark it so */ log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, "Mom reports job already running"); goto ok_exit; } else if ((pbs_errno == PBSE_HOOKERROR) || (pbs_errno == PBSE_HOOK_REJECT) || (pbs_errno == PBSE_HOOK_REJECT_RERUNJOB) || (pbs_errno == PBSE_HOOK_REJECT_DELETEJOB)) { char name_buf[MAXPATHLEN+1]; int rfd; int len; char *reject_msg; int err; err = pbs_errno; reject_msg = pbs_geterrmsg(con); (void)snprintf(log_buffer, sizeof(log_buffer), "send of job to %s failed error = %d reject_msg=%s", destin, err, reject_msg?reject_msg:""); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer); (void)strcpy(name_buf, path_hooks_workdir); (void)strcat(name_buf, jobp->ji_qs.ji_jobid); (void)strcat(name_buf, HOOK_REJECT_SUFFIX); if ((reject_msg != NULL) && (reject_msg[0] != '\0')) { if ((rfd = open(name_buf, O_RDWR|O_CREAT|O_TRUNC, 0600)) == -1) { snprintf(log_buffer, sizeof(log_buffer), "open of reject file %s failed: errno %d", name_buf, errno); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer); } else { secure_file(name_buf, "Administrators", READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED); setmode(rfd, O_BINARY); len = strlen(reject_msg)+1; /* write also trailing null char */ if (write(rfd, reject_msg, len) != len) { snprintf(log_buffer, sizeof(log_buffer), "write to file %s incomplete: errno %d", name_buf, errno); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer); } close(rfd); } } if (err == PBSE_HOOKERROR) exit(SEND_JOB_HOOKERR); if (err == PBSE_HOOK_REJECT) exit(SEND_JOB_HOOK_REJECT); if (err == PBSE_HOOK_REJECT_RERUNJOB) exit(SEND_JOB_HOOK_REJECT_RERUNJOB); if (err == PBSE_HOOK_REJECT_DELETEJOB) exit(SEND_JOB_HOOK_REJECT_DELETEJOB); } else { (void)sprintf(log_buffer, "send of job to %s failed error = %d", destin, pbs_errno); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_INFO, jobp->ji_qs.ji_jobid, log_buffer); continue; } } if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) { if (PBSD_jscript(con, script_name, prot, NULL) != 0) continue; } if (credlen > 0) { int ret; ret = PBSD_jcred(con, jobp->ji_extended.ji_ext.ji_credtype, credbuf, credlen, prot, NULL); if ((ret == 0) || (i == (RETRY - 1))) free(credbuf); /* free credbuf if credbuf is sent successfully OR */ /* at the end of all retry attempts */ if (ret != 0) continue; } if ((move_type == MOVE_TYPE_Exec) && (jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) && (hostaddr != pbs_server_addr)) { /* send files created on prior run */ if ((move_job_file(con, jobp, StdOut, prot) != 0) || (move_job_file(con, jobp, StdErr, prot) != 0) || (move_job_file(con, jobp, Chkpt, prot) != 0)) continue; } jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM; } if (PBSD_rdytocmt(con, job_id, prot, NULL) != 0) continue; if (PBSD_commit(con, job_id, prot, NULL) != 0) goto fatal_exit; goto ok_exit; /* This child process is all done */ } if (con >= 0) svr_disconnect(con); /* * If connection is actively refused by the execution node(or mother superior) OR * the execution node(or mother superior) is rejecting request with error * PBSE_BADHOST(failing to authorize server host), the node should be marked down. */ if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == WSAECONNREFUSED || pbs_errno == PBSE_BADHOST)) { i = SEND_JOB_NODEDW; } else if (should_retry_route(pbs_errno) == -1) { i = SEND_JOB_FATAL; } else { i = SEND_JOB_RETRY; } (void)sprintf(log_buffer, "send_job failed with error %d", pbs_errno); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_NOTICE, jobp->ji_qs.ji_jobid, log_buffer); log_close(0); net_close(-1); unlink(script_name); exit(i); fatal_exit: if (con >= 0) svr_disconnect(con); log_close(0); net_close(-1); unlink(script_name); exit(SEND_JOB_FATAL); ok_exit: if (con >= 0) svr_disconnect(con); log_close(0); net_close(-1); unlink(script_name); exit(SEND_JOB_OK); }