Exemplo n.º 1
0
int
poke_scheduler(attribute *pattr, void *pobj, int actmode)
{
	if (pobj == &server || pobj == dflt_scheduler) {
		if (pobj == &server) {
			/* set this attribute on main scheduler */
			if (dflt_scheduler) {
				sched_attr_def[(int) SCHED_ATR_scheduling].at_set(&dflt_scheduler->sch_attr[(int) SCHED_ATR_scheduling], pattr, SET);
				(void)sched_save_db(dflt_scheduler, SVR_SAVE_FULL);
			}
		} else {
			svr_attr_def[(int) SRV_ATR_scheduling].at_set(&server.sv_attr[SRV_ATR_scheduling], pattr, SET);
			svr_save_db(&server, SVR_SAVE_QUICK);
		}
		if (actmode == ATR_ACTION_ALTER) {
			if (pattr->at_val.at_long)
				set_scheduler_flag(SCH_SCHEDULE_CMD, dflt_scheduler);
		}
	} else {
		if (actmode == ATR_ACTION_ALTER) {
			if (pattr->at_val.at_long)
				set_scheduler_flag(SCH_SCHEDULE_CMD, (pbs_sched *)pobj);
		}
	}
	return PBSE_NONE;
}
Exemplo n.º 2
0
/**
 * @brief
 * 		action routine for the sched's "sched_iteration" attribute
 *
 * @param[in]	pattr	-	attribute being set
 * @param[in]	pobj	-	Object on which attribute is being set
 * @param[in]	actmode	-	the mode of setting, recovery or just alter
 *
 * @return	error code
 * @retval	PBSE_NONE	-	Success
 * @retval	!PBSE_NONE	-	Failure
 *
 */
int
action_sched_iteration(attribute *pattr, void *pobj, int actmode)
{
	if (pobj == dflt_scheduler) {
			server.sv_attr[SRV_ATR_scheduler_iteration].at_val.at_long = pattr->at_val.at_long;
			server.sv_attr[SRV_ATR_scheduler_iteration].at_flags |= ATR_VFLAG_SET | ATR_VFLAG_MODIFY | ATR_VFLAG_MODCACHE;
			svr_save_db(&server, SVR_SAVE_FULL);
	}
	return PBSE_NONE;
}
Exemplo n.º 3
0
/**
 * @brief
 *		Function to migrate filesystem data to database.
 * 		Reads serverdb, scheddb, job files, node, nodestate, queue, resv information
 * 		from the filesystem and save them into the database. All the information is
 * 		recovered and saved into the database under a single database transaction,
 * 		so any failure rolls back all the updates to the database. If all the updates
 * 		to the database succeed, only then the respective files are deleted from the
 * 		filesystem, else no deletion takes place.
 *
 * @return	Error code
 * @retval	0	: success
 * @retval	-1	: Failure
 *
 */
int
svr_migrate_data_from_fs(void)
{
	int baselen;
	struct dirent *pdirent;
	DIR *dir;
	int had;
	char *job_suffix = JOB_FILE_SUFFIX;
	int job_suf_len = strlen(job_suffix);
	job *pjob = NULL;
	pbs_queue *pque;
	resc_resv *presv;
	char *psuffix;
	int rc;
	int recovered = 0;
	char    basen[MAXPATHLEN+1];
	char	scrfile[MAXPATHLEN+1];
	char	jobfile[MAXPATHLEN+1];
	char	origdir[MAXPATHLEN+1];
	int fd;
	struct stat stbuf;
	char *scrbuf = NULL;
	pbs_db_jobscr_info_t	jobscr;
	pbs_db_obj_info_t		obj;

	path_svrdb_new = build_path(path_priv, PBS_SERVERDB, new_tag);
	path_scheddb = build_path(path_priv, PBS_SCHEDDB, NULL);
	path_scheddb_new = build_path(path_priv, PBS_SCHEDDB, new_tag);
	path_queues = build_path(path_priv, PBS_QUEDIR, suffix_slash);
	path_resvs = build_path(path_priv, PBS_RESVDIR, suffix_slash);
	path_nodes = build_path(path_priv, NODE_DESCRIP, NULL);
	path_nodestate = build_path(path_priv, NODE_STATUS, NULL);

	/*    If not a "create" initialization, recover server db */
	/*    and sched db					  */
	if (chk_save_file(path_svrdb) != 0) {
		fprintf(stderr, "No serverdb found to update to datastore\n");
		return (0);
	}

	if (setup_resc(1) == -1) {
		fprintf(stderr, "%s\n", log_buffer);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		return (-1);
	}

	init_server_attrs();

	/* start a database transation for the whole recovery */
	if (pbs_db_begin_trx(svr_db_conn, 0, 0) != 0)
		return (-1);

	/* preprocess the nodes file to convert old properties to resources */
	if (setup_nodes_fs(1) == -1) {
		fprintf(stderr, "%s\n", log_buffer);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		return (-1);
	}

	/* Open the server database (save file) and read it in */
	if (svr_recov_fs(path_svrdb) == -1) {
		fprintf(stderr, "%s\n", msg_init_baddb);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		return (-1);
	}

	/* save server information to database now */
	if (svr_save_db(&server, SVR_SAVE_NEW) != 0) {
		fprintf(stderr, "Could not save server db\n");
		if (svr_db_conn->conn_db_err)
			fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		return (-1);
	}

	/* now do sched db */
	if (sched_recov_fs(path_scheddb) == -1) {
		fprintf(stderr, "Unable to recover scheddb\n");
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		return (-1);
	}

	if (sched_save_db(&scheduler, SVR_SAVE_NEW) != 0) {
		fprintf(stderr, "Could not save scheduler db\n");
		if (svr_db_conn->conn_db_err)
			fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		return (-1);
	}

	/* save current working dir before any chdirs */
	if (getcwd(origdir, MAXPATHLEN) == NULL) {
		fprintf(stderr, "getcwd failed\n");
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		return (-1);
	}

	if (chdir(path_queues) != 0) {
		fprintf(stderr, msg_init_chdir, path_queues);
		fprintf(stderr, "\n");
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		chdir(origdir);
		return (-1);
	}

	had = server.sv_qs.sv_numque;
	server.sv_qs.sv_numque = 0;
	dir = opendir(".");
	if (dir == (DIR *) 0) {
		fprintf(stderr, "%s\n", msg_init_noqueues);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		chdir(origdir);
		return (-1);
	}
	while (errno = 0, (pdirent = readdir(dir)) != (struct dirent *) 0) {
		if (chk_save_file(pdirent->d_name) == 0) {
			if ((pque = que_recov_fs(pdirent->d_name)) !=
				(pbs_queue *) 0) {
				/* que_recov increments sv_numque */
				fprintf(stderr, msg_init_recovque,
					pque->qu_qs.qu_name);
				fprintf(stderr, "\n");
				if (que_save_db(pque, QUE_SAVE_NEW) != 0) {
					fprintf(stderr,
						"Could not save queue info for queue %s\n",
						pque->qu_qs.qu_name);
					if (svr_db_conn->conn_db_err)
						fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err);
					(void) pbs_db_end_trx(svr_db_conn,
						PBS_DB_ROLLBACK);
					(void) closedir(dir);
					chdir(origdir);
					return (-1);
				}
			}
		}
	}
	if (errno != 0 && errno != ENOENT) {
		fprintf(stderr, "%s\n", msg_init_noqueues);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		(void) closedir(dir);
		chdir(origdir);
		return (-1);
	}
	(void) closedir(dir);
	if (had != server.sv_qs.sv_numque) {
		fprintf(stderr, msg_init_expctq, had, server.sv_qs.sv_numque);
		fprintf(stderr, "\n");
	}

	/* Open and read in node list if one exists */
	if (setup_nodes_fs(0) == -1) {
		fprintf(stderr, "%s\n", log_buffer);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		chdir(origdir);
		return (-1);
	}

	/*
	 * Recover reservations.
	 */
	if (chdir(path_resvs) != 0) {
		fprintf(stderr, msg_init_chdir, path_resvs);
		fprintf(stderr, "\n");
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		chdir(origdir);
		return (-1);
	}

	dir = opendir(".");
	if (dir == (DIR *) 0) {
		fprintf(stderr, "%s\n", msg_init_noresvs);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		chdir(origdir);
		return (-1);
	}
	while (errno = 0, (pdirent = readdir(dir)) != (struct dirent *) 0) {
		if (chk_save_file(pdirent->d_name) == 0) {
			presv = (resc_resv *)
				job_or_resv_recov_fs(pdirent->d_name,
				RESC_RESV_OBJECT);
			if (presv != (resc_resv *) 0) {
				if (resv_save_db(presv, SAVERESV_NEW) != 0) {
					fprintf(stderr,
						"Could not save resv info for resv %s\n",
						presv->ri_qs.ri_resvID);
					if (svr_db_conn->conn_db_err)
						fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err);
					(void) pbs_db_end_trx(svr_db_conn,
						PBS_DB_ROLLBACK);
					(void) closedir(dir);
					chdir(origdir);
					return (-1);
				}
			}
		}
	}
	if (errno != 0 && errno != ENOENT) {
		fprintf(stderr, "%s\n", msg_init_noresvs);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		(void) closedir(dir);
		chdir(origdir);
		return (-1);
	}
	(void) closedir(dir);

	/*
	 *   Recover jobs
	 */
	if (chdir(path_jobs) != 0) {
		fprintf(stderr, msg_init_chdir, path_jobs);
		fprintf(stderr, "\n");
		chdir(origdir);
		return (-1);
	}

	had = server.sv_qs.sv_numjobs;
	server.sv_qs.sv_numjobs = 0;
	recovered = 0;
	dir = opendir(".");
	if (dir == (DIR *) 0) {
		if (had == 0) {
			fprintf(stderr, msg_init_nojobs);
		} else {
			fprintf(stderr, msg_init_exptjobs, had, 0);
		}
		fprintf(stderr, "\n");
	} else {
		/* Now, for each job found ... */
		while (errno = 0,
			(pdirent = readdir(dir)) != (struct dirent *) 0) {
			if (chk_save_file(pdirent->d_name) != 0)
				continue;

			/* recover the job */
			baselen = strlen(pdirent->d_name) - job_suf_len;
			psuffix = pdirent->d_name + baselen;
			if (strcmp(psuffix, job_suffix))
				continue;

			if ((pjob = job_recov_fs(pdirent->d_name,  RECOV_SUBJOB)) == NULL) {
				(void)strcpy(basen, pdirent->d_name);
				psuffix = basen + baselen;
				(void)strcpy(psuffix, JOB_BAD_SUFFIX);
				(void)snprintf(log_buffer, sizeof(log_buffer), "moved bad file to %s",
					basen);
				log_event(PBSEVENT_SYSTEM,
					PBS_EVENTCLASS_SERVER, LOG_NOTICE,
					msg_daemonname, log_buffer);
				continue;
			}

			if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
				/* load the job script file */
				strcpy(scrfile, path_jobs);
#ifndef WIN32
				/* under WIN32, there's already a prefixed '/' */
				(void) strcat(scrfile, "/");
#endif
				strcat(scrfile, pdirent->d_name);
				baselen = strlen(scrfile) - strlen(JOB_FILE_SUFFIX);
				scrfile[baselen] = 0; /* put null char */
				strcat(scrfile, JOB_SCRIPT_SUFFIX);
				rc = 1;
#ifdef WIN32
				if ((fd = open(scrfile, O_BINARY | O_RDONLY)) != -1)
#else
				if ((fd = open(scrfile, O_RDONLY)) != -1)
#endif
				{
					/* load the script */
					if (fstat(fd, &stbuf) == 0) {
						if ((scrbuf = malloc(stbuf.st_size + 1))) {
							if (read(fd, scrbuf, stbuf.st_size) == stbuf.st_size) {
								scrbuf[stbuf.st_size] = '\0'; /* null character */
								rc = 0; /* success loading */
							}
						}
					}
					close(fd);
				}

				if (rc != 0) {
					fprintf(stderr, "Could not recover script file for job %s\n", pjob->ji_qs.ji_jobid);
					(void) strcpy(basen, scrfile);
					psuffix = basen + strlen(scrfile) - strlen(JOB_SCRIPT_SUFFIX);
					(void) strcpy(psuffix, JOB_BAD_SUFFIX);

					(void) strcpy(jobfile, scrfile);
					psuffix = jobfile + strlen(jobfile) - strlen(JOB_SCRIPT_SUFFIX);
					(void) strcpy(psuffix, JOB_FILE_SUFFIX);
#ifdef WIN32
					if (MoveFileEx(jobfile, basen,
						MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH) == 0) {
						errno = GetLastError();
						snprintf(log_buffer, sizeof(log_buffer), "MoveFileEx(%s, %s) failed!",
							jobfile, basen);
						log_err(errno, "script", log_buffer);

					}
					secure_file(basen, "Administrators",
						READS_MASK | WRITES_MASK | STANDARD_RIGHTS_REQUIRED);
#else
					if (rename(jobfile, basen) == -1) {
						snprintf(log_buffer, sizeof(log_buffer), "error renaming job file %s",
							jobfile);
						log_err(errno, "job_recov", log_buffer);
					}
#endif
					(void) snprintf(log_buffer, sizeof(log_buffer), "moved bad file to %s",
						basen);
					log_event(PBSEVENT_SYSTEM,
						PBS_EVENTCLASS_SERVER, LOG_NOTICE,
						msg_daemonname, log_buffer);
					free(scrbuf);
					scrbuf = NULL;
					continue;
				}
			}

			/* now save job first */
			if (job_save_db(pjob, SAVEJOB_NEW) != 0) {
				fprintf(stderr, "Could not save job info for jobid %s\n",
					pjob->ji_qs.ji_jobid);
				if (svr_db_conn->conn_db_err)
					fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err);
				(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
				(void) closedir(dir);
				chdir(origdir);
				free(scrbuf);
				return (-1);
			}

			if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
				/* save job script */
				strcpy(jobscr.ji_jobid, pjob->ji_qs.ji_jobid);
				jobscr.script = scrbuf;
				obj.pbs_db_obj_type = PBS_DB_JOBSCR;
				obj.pbs_db_un.pbs_db_jobscr = &jobscr;
				if (pbs_db_insert_obj(svr_db_conn, &obj) != 0) {
					fprintf(stderr, "Could not save job script for jobid %s\n",
						pjob->ji_qs.ji_jobid);
					if (svr_db_conn->conn_db_err)
						fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err);
					(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
					free(scrbuf);
					(void) closedir(dir);
					chdir(origdir);
					return (-1);
				}
				free(scrbuf);
				scrbuf = NULL;
			}

			recovered++;
		}
		if (errno != 0 && errno != ENOENT) {
			if (pjob)
				fprintf(stderr, "readdir error for jobid %s\n", pjob->ji_qs.ji_jobid);
			else
				fprintf(stderr, "readdir error\n");
			(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
			free(scrbuf);
			(void) closedir(dir);
			chdir(origdir);
			return (-1);
		}
		(void) closedir(dir);
		if (had != recovered) {
			fprintf(stderr, msg_init_exptjobs, had, recovered);
			fprintf(stderr, "\n");
		}
	}

	if (save_nodes_db(0) != 0) {
		fprintf(stderr, "Could not save nodes\n");
		if (svr_db_conn->conn_db_err)
			fprintf(stderr, "[%s]\n", (char*)svr_db_conn->conn_db_err);
		(void) pbs_db_end_trx(svr_db_conn, PBS_DB_ROLLBACK);
		chdir(origdir);
		return (-1);
	}

	if (pbs_db_end_trx(svr_db_conn, PBS_DB_COMMIT) == 0) {
		rm_migrated_files(path_priv);
		chdir(origdir);
		return (0);
	}
	chdir(origdir);
	return -1;
}
Exemplo n.º 4
0
void
svr_shutdown(int type)
{
	attribute	  *pattr;
	job		  *pjob;
	job		  *pnxt;
	long		 *state;
	int		  wait_for_secondary = 0;

	/* Lets start by logging shutdown and saving everything */

	state = &server.sv_attr[(int)SRV_ATR_State].at_val.at_long;
	(void)strcpy(log_buffer, msg_shutdown_start);

	if (*state == SV_STATE_SHUTIMM) {

		/* if already shuting down, another Immed/sig will force it */

		if ((type == SHUT_IMMEDIATE) || (type == SHUT_SIG)) {
			*state = SV_STATE_DOWN;
			(void)strcat(log_buffer, "Forced");
			log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN|PBSEVENT_DEBUG,
				PBS_EVENTCLASS_SERVER, LOG_NOTICE,
				msg_daemonname, log_buffer);
			return;
		}
	}

	/* in failover environments, need to communicate with Secondary */
	/* and for these two where the Primary is going down, mark to   */
	/* wait for the acknowledgement from the Secondary              */

	if (type & SHUT_WHO_SECDRY) {
		if (failover_send_shutdown(FAILOVER_SecdShutdown) == 0)
			wait_for_secondary = 1;
	} else if (type & SHUT_WHO_IDLESECDRY) {
		if (failover_send_shutdown(FAILOVER_SecdGoInactive) == 0)
			wait_for_secondary = 1;
	}

	/* what is the manner of our demise? */

	type = type & SHUT_MASK;
	if (type == SHUT_IMMEDIATE) {
		*state = SV_STATE_SHUTIMM;
		(void)strcat(log_buffer, "Immediate");

	} else if (type == SHUT_DELAY) {
		*state = SV_STATE_SHUTDEL;
		(void)strcat(log_buffer, "Delayed");

	} else if (type == SHUT_QUICK) {
		*state = SV_STATE_DOWN; /* set to down to brk pbsd_main loop */
		(void)strcat(log_buffer, "Quick");

	} else {
		*state = SV_STATE_DOWN;
		(void)strcat(log_buffer, "By Signal");
		type = SHUT_QUICK;
	}
	log_event(PBSEVENT_SYSTEM|PBSEVENT_ADMIN|PBSEVENT_DEBUG,
		PBS_EVENTCLASS_SERVER, LOG_NOTICE, msg_daemonname, log_buffer);

	if (wait_for_secondary)
		*state |= SV_STATE_PRIMDLY; /* wait for reply from Secondary */

	if (type == SHUT_QUICK) /* quick, leave jobs as are */
		return;

	svr_save_db(&server, SVR_SAVE_QUICK);

	pnxt = (job *)GET_NEXT(svr_alljobs);
	while ((pjob = pnxt) != (job *)0) {
		pnxt = (job *)GET_NEXT(pjob->ji_alljobs);

		if (pjob->ji_qs.ji_state == JOB_STATE_RUNNING) {

			pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART;
			pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HASRUN;
			pattr = &pjob->ji_wattr[(int)JOB_ATR_chkpnt];
			if ((pattr->at_val.at_str) &&
				(*pattr->at_val.at_str != 'n')) {
				/* do checkpoint of job */

				if (shutdown_chkpt(pjob) == 0)
					continue;
			}

			/* if not checkpoint (not supported, not allowed, or fails */
			/* rerun if possible, else kill job			   */

			rerun_or_kill(pjob, msg_on_shutdown);
		}
	}
	return;
}