Exemplo n.º 1
0
/**
 * @brief	This is the Unix counterpart of the monitoring
 *			code.
 * @par
 *		This function does the following:
 *		a) Creates a pipe, forks itself, parent waits to read on pipe.
 *		b) Child creates/opens a file $PBS_HOME/datastore/pbs_dblock.
 *		c) Attempts to lock the file. If locking
 *			succeeds, unlocks the file and writes 0 (success) to the write
 *			end of the pipe. If locking fails, writes 1 (failure) to pipe,
 *			followed by a textual reason.
 *			Parent reads from pipe and returns the code read from pipe.
 *		d) If mode is "check" then child quits.
 *		e) If mode is "monitor", continues in the background, checking
 *			the database pid in a loop forever. If database pid goes
 *			down, then it unlocks the file and exits.
 *
 * @param[in]	mode	-	"check" - to just check if lockfile can be locked
 *		     				"monitor" - to launch a monitoring child process that
 *							holds onto the file lock.
 *
 * @retval	1	-	Function failed to acquire lock (or mode was NULL, or
 *				the pipe/fork/setsid set-up failed)
 * @retval	0	-	Function succeeded in the requested operation
 * @par
 * 		In the parent the return value is the status read from the pipe.
 * 		The child also returns from this function (it does not call exit
 * 		itself), so the caller sees a return in both processes.
 *
 * @par MT-safe: Yes
 */
int
unix_db_monitor(char *mode)
{
	int fd;
	int rc;
	int i;
	int is_check;
	pid_t dbpid;
	char lockfile[MAXPATHLEN + 1];
	int pipefd[2];
	int res;
	int is_lock_local = 0;
	char reason[RES_BUF_SIZE];
	ssize_t nread;

	/* strcmp() on a NULL mode would be undefined behaviour */
	if (mode == NULL)
		return 1;
	is_check = (strcmp(mode, "check") == 0);

	/*
	 * The whole buffer is later written to the pipe by the child, so clear
	 * all of it rather than only the first byte.
	 */
	memset(reason, 0, sizeof(reason));

	if (pipe(pipefd) != 0) {
		fprintf(stderr, "Unable to create pipe, errno = %d\n", errno);
		return 1;
	}

	snprintf(lockfile, sizeof(lockfile), "%s/datastore/pbs_dblock", pbs_conf.pbs_home_path);

	/* first fork off */
	rc = fork();
	if (rc == -1) {
		fprintf(stderr, "Unable to create process, errno = %d\n", errno);
		/* no child exists, so nobody else will ever close these */
		close(pipefd[0]);
		close(pipefd[1]);
		return 1;
	}

	if (rc > 0) {
		/* parent */
		close(pipefd[1]);
		/*
		 * child can continue to execute in case of "monitor",
		 * so dont wait for child to exit, rather read code
		 * from pipe that child will write to
		 */
		if (read(pipefd[0], &res, sizeof(int)) != sizeof(int)) {
			close(pipefd[0]);
			return 1;
		}

		if (res != 0) {
			/*
			 * The reason text follows the status. The read may be short
			 * or fail, so terminate the buffer at whatever was received
			 * before printing it.
			 */
			nread = read(pipefd[0], reason, sizeof(reason));
			if (nread < 0)
				nread = 0;
			if ((size_t)nread >= sizeof(reason))
				nread = sizeof(reason) - 1;
			reason[nread] = '\0';
			fprintf(stderr, "Failed to acquire lock on %s. %s\n", lockfile, reason);
		}

		close(pipefd[0]);
		return (res); /* parent returns the status reported by the child */
	}

	close(pipefd[0]);

	/* child */
	if (setsid() == -1) {
		/* parent sees EOF instead of a status and returns 1 */
		close(pipefd[1]);
		return 1;
	}

	(void)fclose(stdin);
	(void)fclose(stdout);
	(void)fclose(stderr);

	/* Protect from being killed by kernel */
	daemon_protect(0, PBS_DAEMON_PROTECT_ON);

	if ((fd = acquire_lock(lockfile, reason, sizeof(reason), &is_lock_local)) == -1) {
		if (is_lock_local && is_check) {
			/* write success to parent since lock is already held by the localhost */
			res = 0;
			write(pipefd[1], &res, sizeof(int));
			close(pipefd[1]);
			return 0;
		}
		/* report failure: status first, then the reason buffer */
		res = 1;
		write(pipefd[1], &res, sizeof(int));
		write(pipefd[1], reason, sizeof(reason));
		close(pipefd[1]);
		return 1;
	}

	/* unlock before writing success to parent, to avoid race */
	if (is_check) {
		lock_out(fd, F_UNLCK);
		close(fd);
		unlink(lockfile);
	}

	/* write success to parent since we acquired the lock */
	res = 0;
	write(pipefd[1], &res, sizeof(int));
	close(pipefd[1]);

	if (is_check)
		return 0;

	/* clear any residual stop db file before starting monitoring */
	clear_stop_db_file();

	/*
	 * first find out the pid of the postgres process from dbstore/postmaster.pid
	 * wait for a while till it is found
	 * if not found within MAX_DBPID_ATTEMPTS then break with error
	 * if found, start monitoring the pid
	 *
	 */
	dbpid = 0;
	for (i = 0; i < MAX_DBPID_ATTEMPTS; i++) {
		if ((dbpid = get_pid()) > 0)
			break;
		/* touch the lock file while waiting */
		(void)utimes(lockfile, NULL);
		sleep(1);
	}

	/*
	 * Treat any non-positive value as "not found": passing 0 or a negative
	 * pid to kill() addresses process groups / all processes rather than
	 * the database process.
	 */
	if (dbpid <= 0) {
		/* database did not come up, so quit after unlocking file */
		lock_out(fd, F_UNLCK);
		close(fd);
		unlink(lockfile);
		return 0;
	}

	while (1) {
		(void)utimes(lockfile, NULL);

		/* signal 0 only probes whether the process can be signalled */
		if (kill(dbpid, 0) != 0)
			break;
		/* re-read the pid each pass; stop once no valid pid is reported */
		if (!((dbpid = get_pid()) > 0))
			break;

		/* check if stop db file exists */
		check_and_stop_db(dbpid);

		sleep(1);
	}

	lock_out(fd, F_UNLCK);
	close(fd);
	unlink(lockfile);

	return 0;
}
Exemplo n.º 2
0
/**
 *
 * @brief
 * 		Send a job over the network to some other server or MOM.
 * @par
 * 		Under Linux/Unix, this starts a child process to do the work.
 *		Connect to the destination host and port,
 * 		and go through the protocol to transfer the job.
 * 		Signals are blocked.
 * @par
 *		When pbs_conf.pbs_use_tcp is 1, the move type is MOVE_TYPE_Exec and
 *		the job does not carry a grid-proxy credential, the request is
 *		handed to send_job_exec() and its return value is returned as is.
 *
 * @param[in]	jobp	-	pointer to the job being sent.
 * @param[in]	hostaddr	-	the address of host to send job to, host byte order.
 * @param[in]	port	-	the destination port, host byte order
 * @param[in]	move_type	-	the type of move (e.g. MOVE_TYPE_exec)
 * @param[in]	post_func	-	the function to execute once the child process
 *								sending job completes (Linux/Unix only)
 * @param[in]	preq	-	handed to set_task() as the parameter of the work
 *								task that runs 'post_func'; also passed on to
 *								send_job_exec()
 *
 * @return	int
 * @retval	2	parent	: success (child forked)
 * @retval	-1	parent	: on failure (pbs_errno set to error number)
 * @retval	SEND_JOB_OK	child	: 0 success, job sent
 * @retval	SEND_JOB_FATAL	child	: 1 permanent failure or rejection,
 * @retval	SEND_JOB_RETRY	child	: 2 failed but try again
 * @retval	SEND_JOB_NODEDW child	: 3 execution node down, retry different node
 * @par
 *		The child also exits with SEND_JOB_HOOKERR, SEND_JOB_HOOK_REJECT,
 *		SEND_JOB_HOOK_REJECT_RERUNJOB or SEND_JOB_HOOK_REJECT_DELETEJOB when
 *		the receiver reports the matching hook error.
 */
int
send_job(job *jobp, pbs_net_t hostaddr, int port, int move_type,
	void (*post_func)(struct work_task *), struct batch_request *preq)
{

#ifdef WIN32
	char	cmdline[MAXPATHLEN+1];
	pio_handles	pio;
	char	buf[4096];
	struct work_task *ptask;
	int	newstate;
	int	newsub;
	long	tempval;
	char	script_name[MAXPATHLEN+1];
	int 		gridproxy_cred = 0;

#ifdef  PBS_CRED_GRIDPROXY
	if (jobp->ji_extended.ji_ext.ji_credtype == PBS_CREDTYPE_GRIDPROXY)
		gridproxy_cred = 1;
#endif

	if (pbs_conf.pbs_use_tcp == 1 && move_type == MOVE_TYPE_Exec && gridproxy_cred == 0) {
		return (send_job_exec(jobp, hostaddr, port, preq));
	}

	/* bounded: pbs_exec_path is configuration supplied */
	snprintf(cmdline, sizeof(cmdline), "%s/sbin/pbs_send_job", pbs_conf.pbs_exec_path);

	if (win_popen(cmdline, "w", &pio, NULL) == 0) {
		errno = GetLastError();
		pbs_errno = errno;
		(void)snprintf(log_buffer, sizeof(log_buffer),
			"executing %s for job %s failed errno=%d", cmdline, jobp->ji_qs.ji_jobid, errno);
		log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_ERR,
			jobp->ji_qs.ji_jobid, log_buffer);
		/* force re-eval of job state out of Transit */
		svr_evaljobstate(jobp, &newstate, &newsub, 1);
		svr_setjobstate(jobp, newstate, newsub);

		win_pclose(&pio);
		return (-1);
	}

	ptask = set_task(WORK_Deferred_Child, (long)pio.pi.hProcess, post_func, preq);
	if (!ptask) {
		log_err(errno, __func__, msg_err_malloc);
		errno = ENOMEM;
		pbs_errno = errno;
		win_pclose(&pio);
		/* force re-eval of job state out of Transit */
		svr_evaljobstate(jobp, &newstate, &newsub, 1);
		svr_setjobstate(jobp, newstate, newsub);
		return (-1);
	} else {
		ptask->wt_parm2 = jobp;
		append_link(&((job *)jobp)->ji_svrtask, &ptask->wt_linkobj, ptask);
	}

	script_name[0] = '\0';
	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 */
		if (svr_create_tmp_jobscript(jobp, script_name) != 0) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer),
				"Failed to create temporary job script for job %s",
				jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			win_pclose2(&pio);
			return (-1);
		}
	}

	addpid(pio.pi.hProcess);

	/* our job is to calc eligible time accurately and save it */
	/* on new server, accrue type should be calc afresh */
	/* Note: if job is being sent for execution on mom, then don't calc eligible time */

	if ((jobp->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE) &&
		(server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) &&
		(move_type != MOVE_TYPE_Exec)) {
		tempval = ((long)time_now - jobp->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long += tempval;
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;
	}

	/* in windows code, a child process "w32_send_job" handles the send
	 * This needs the job information, so we save using the filesystem
	 * This avoids the child process from having to "connect" to the database again
	 * The file is deleted by the send_job child process when it has done recovering the job
	 */
	job_save_fs(jobp, SAVEJOB_FULLFORCE);	/* so the spawned process can get a fresh copy of job */

	/*
	 * Everything below is a sequence of "key=value\n" lines written to the
	 * spawned process over the pipe, terminated by "quit\n".
	 */
	if (*jobp->ji_qs.ji_fileprefix != '\0')
		snprintf(buf, sizeof(buf), "jobfile=%s%s\n", jobp->ji_qs.ji_fileprefix, JOB_FILE_SUFFIX);
	else
		snprintf(buf, sizeof(buf), "jobfile=%s%s\n", jobp->ji_qs.ji_jobid, JOB_FILE_SUFFIX);

	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "destaddr=%ld\n", hostaddr);
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "destport=%d\n", port);
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "move_type=%d\n", move_type);
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "in_server=%d\n", is_linked(&svr_alljobs, &jobp->ji_alljobs));
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "server_name=%s\n", (server_name?server_name:""));
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "server_host=%s\n", (server_host?server_host:""));
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "server_addr=%ld\n", pbs_server_addr);
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "server_port=%d\n", pbs_server_port_dis);
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "log_file=%s\n", (log_file?log_file:""));
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "path_log=%s\n", (path_log?path_log:""));
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "path_jobs=%s\n", (path_jobs?path_jobs:""));
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "path_spool=%s\n", (path_spool?path_spool:""));
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "path_rescdef=%s\n", (path_rescdef?path_rescdef:""));
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "path_users=%s\n", (path_users?path_users:""));
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "path_hooks_workdir=%s\n",
		(path_hooks_workdir?path_hooks_workdir:""));
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "svr_history_enable=%ld\n", svr_history_enable);
	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "svr_history_duration=%ld\n", svr_history_duration);
	win_pwrite(&pio, buf, strlen(buf));

	if ((server.sv_attr[SRV_ATR_ssignon_enable].at_flags & ATR_VFLAG_SET) &&
		(server.sv_attr[SRV_ATR_ssignon_enable].at_val.at_long == 1))
		strcpy(buf, "single_signon_password_enable=1\n");
	else
		strcpy(buf, "single_signon_password_enable=0\n");

	win_pwrite(&pio, buf, strlen(buf));

	snprintf(buf, sizeof(buf), "script_name=%s\n", script_name);
	win_pwrite(&pio, buf, strlen(buf));

	strcpy(buf, "quit\n");
	win_pwrite(&pio, buf, strlen(buf));
	win_pclose2(&pio);	/* closes all handles except the process handle */
	return (2);
#else
	pbs_list_head	 attrl;
	enum conn_type   cntype = ToServerDIS;
	int		 con;
	char		*credbuf = NULL;
	size_t		 credlen = 0;
	char		*destin = jobp->ji_qs.ji_destin;
	int		 encode_type;
	int		 i;
	char		 job_id[PBS_MAXSVRJOBID+1];
	attribute	*pattr;
	pid_t		 pid;
	struct attropl  *pqjatr;      /* list (single) of attropl for quejob */
	char		 script_name[MAXPATHLEN+1];
	struct work_task *ptask;
	struct  hostent *hp;
	struct in_addr   addr;
	long		 tempval;
	int 		gridproxy_cred = 0;
	int 		rpp = 0;

#ifdef  PBS_CRED_GRIDPROXY
	if (jobp->ji_extended.ji_ext.ji_credtype == PBS_CREDTYPE_GRIDPROXY)
		gridproxy_cred = 1;
#endif

	if (pbs_conf.pbs_use_tcp == 1 && move_type == MOVE_TYPE_Exec && gridproxy_cred == 0) {
		return (send_job_exec(jobp, hostaddr, port, preq));
	}

	script_name[0] = '\0';
	/* if job has a script read it from database */
	if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
		/*
		 * copy the job script from database to a temp file
		 * PBSD_jscript works with a file
		 * delete it at the end of the send
		 */
		if (svr_create_tmp_jobscript(jobp, script_name) != 0) {
			pbs_errno = PBSE_SYSTEM;
			snprintf(log_buffer, sizeof(log_buffer),
				"Failed to create temporary job script for job %s",
				jobp->ji_qs.ji_jobid);
			log_err(pbs_errno, "send_job", log_buffer);
			return -1;
		}
	}

	pid = fork();
	if (pid == -1) {	/* Error on fork */
		log_err(errno, __func__, "fork failed\n");
		pbs_errno = PBSE_SYSTEM;
		/* no child will run, so remove the temp script here */
		if (script_name[0] != '\0')
			(void)unlink(script_name);
		return -1;
	}

	if (pid != 0) {		/* The parent (main server) */

		ptask = set_task(WORK_Deferred_Child, pid, post_func, preq);
		if (!ptask) {
			log_err(errno, __func__, msg_err_malloc);
			/* honour the documented contract: pbs_errno is set on -1 */
			pbs_errno = PBSE_SYSTEM;
			return (-1);
		} else {
			ptask->wt_parm2 = jobp;
			append_link(&((job *)jobp)->ji_svrtask,
				&ptask->wt_linkobj, ptask);
		}
		return 2;
	}

	/*
	 * the child process
	 *
	 * set up signal cather for error return
	 */
	DBPRT(("%s: child started, sending to port %d\n", __func__, port))
	rpp_terminate();

	/* Unprotect child from being killed by kernel */
	daemon_protect(0, PBS_DAEMON_PROTECT_OFF);

#ifdef WIN32
	/* get host name */
	/*
	 * If host address is loopback address then do not resolve with dns
	 * Use "localhost" as the host name.
	 */
	if ((htonl(hostaddr) == loopback_addr->sin_addr.s_addr)) {
		(void)get_credential(LOCALHOST_SHORTNAME, jobp, PBS_GC_BATREQ,
			&credbuf, &credlen);
	} else {
#endif
		addr.s_addr = htonl(hostaddr);
		hp = gethostbyaddr((void *)&addr, sizeof(struct in_addr), AF_INET);
		if (hp == NULL) {
			snprintf(log_buffer, sizeof(log_buffer), "%s: h_errno=%d",
				inet_ntoa(addr), h_errno);
			log_err(-1, __func__, log_buffer);
		} else {
			/* read any credential file */
			(void)get_credential(hp->h_name, jobp, PBS_GC_BATREQ,
				&credbuf, &credlen);
		}
#ifdef WIN32
	}
#endif

	/* encode job attributes to be moved */

	CLEAR_HEAD(attrl);

	/* select attributes/resources to send based on move type */

	if (move_type == MOVE_TYPE_Exec) {
		resc_access_perm = ATR_DFLAG_MOM;
		encode_type = ATR_ENCODE_MOM;
		cntype = ToServerDIS;
	} else {
		resc_access_perm = ATR_DFLAG_USWR | ATR_DFLAG_OPWR |
			ATR_DFLAG_MGWR | ATR_DFLAG_SvRD;
		encode_type = ATR_ENCODE_SVR;
		svr_dequejob(jobp);	/* clears default resource settings */
	}

	/* our job is to calc eligible time accurately and save it */
	/* on new server, accrue type should be calc afresh */
	/* Note: if job is being sent for execution on mom, then don't calc eligible time */

	if ((jobp->ji_wattr[(int)JOB_ATR_accrue_type].at_val.at_long == JOB_ELIGIBLE) &&
		(server.sv_attr[(int)SRV_ATR_EligibleTimeEnable].at_val.at_long == 1) &&
		(move_type != MOVE_TYPE_Exec)) {
		tempval = ((long)time_now - jobp->ji_wattr[(int)JOB_ATR_sample_starttime].at_val.at_long);
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_val.at_long += tempval;
		jobp->ji_wattr[(int)JOB_ATR_eligible_time].at_flags |= ATR_VFLAG_MODCACHE;
	}

	/* encode every attribute whose definition flags match the access mask */
	pattr = jobp->ji_wattr;
	for (i=0; i < (int)JOB_ATR_LAST; i++) {
		if ((job_attr_def+i)->at_flags & resc_access_perm) {
			(void)(job_attr_def+i)->at_encode(pattr+i, &attrl,
				(job_attr_def+i)->at_name, (char *)0,
				encode_type, NULL);
		}
	}
	attrl_fixlink(&attrl);


	/* save the job id for when after we purge the job */

	(void)snprintf(job_id, sizeof(job_id), "%s", jobp->ji_qs.ji_jobid);

	pbs_errno = 0;
	con = -1;

	for (i=0; i<RETRY; i++) {

		/* connect to receiving server with retries */

		if (i > 0) {	/* recycle after an error */
			if (con >= 0)
				svr_disconnect(con);
			if (should_retry_route(pbs_errno) == -1) {
				/* delete the temp script file */
				unlink(script_name);
				exit(SEND_JOB_FATAL);	/* fatal error, don't retry */
			}
			sleep(1<<i);	/* back off: 2, 4, 8, ... seconds */
		}
		if ((con = svr_connect(hostaddr, port, 0, cntype, rpp)) ==
			PBS_NET_RC_FATAL) {
			(void)snprintf(log_buffer, sizeof(log_buffer),
				"send_job failed to %lx port %d",
				hostaddr, port);
			log_err(pbs_errno, __func__, log_buffer);

			/* delete the temp script file */
			unlink(script_name);

			if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == PBSE_BADCRED))
				exit(SEND_JOB_NODEDW);

			exit(SEND_JOB_FATAL);
		} else if (con == PBS_NET_RC_RETRY) {
			pbs_errno = ECONNREFUSED;	/* should retry */
			continue;
		}

		/*
		 * if the job is substate JOB_SUBSTATE_TRNOUTCM which means
		 * we are recovering after being down or a late failure, we
		 * just want to send the commit"
		 */

		if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUTCM) {

			if (jobp->ji_qs.ji_substate != JOB_SUBSTATE_TRNOUT) {
				jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUT;
			}

			pqjatr = &((svrattrl *)GET_NEXT(attrl))->al_atopl;
			if (PBSD_queuejob(con, jobp->ji_qs.ji_jobid, destin,
				pqjatr, (char *)0, rpp, NULL) == 0) {
				if (pbs_errno == PBSE_JOBEXIST &&
					move_type == MOVE_TYPE_Exec) {
					/* already running, mark it so */
					log_event(PBSEVENT_ERROR, PBS_EVENTCLASS_JOB,
						LOG_INFO, jobp->ji_qs.ji_jobid,
						"Mom reports job already running");
					/* delete the temp script file */
					unlink(script_name);
					exit(SEND_JOB_OK);
				}
				else if ((pbs_errno == PBSE_HOOKERROR) ||
					(pbs_errno == PBSE_HOOK_REJECT)  ||
					(pbs_errno == PBSE_HOOK_REJECT_RERUNJOB)  ||
					(pbs_errno == PBSE_HOOK_REJECT_DELETEJOB)) {
					char		name_buf[MAXPATHLEN+1];
					int		rfd;
					int		len;
					char		*reject_msg;
					int		err;

					/* keep the code; later calls may change pbs_errno */
					err = pbs_errno;

					reject_msg = pbs_geterrmsg(con);
					(void)snprintf(log_buffer, sizeof(log_buffer),
						"send of job to %s failed error = %d reject_msg=%s",
						destin, err,
						reject_msg?reject_msg:"");
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
						LOG_INFO, jobp->ji_qs.ji_jobid,
						log_buffer);

					/* <hooks workdir><jobid><reject suffix>, bounded */
					(void)snprintf(name_buf, sizeof(name_buf), "%s%s%s",
						path_hooks_workdir,
						jobp->ji_qs.ji_jobid,
						HOOK_REJECT_SUFFIX);

					if ((reject_msg != NULL) &&
						(reject_msg[0] != '\0')) {

						if ((rfd = open(name_buf,
							O_RDWR|O_CREAT|O_TRUNC, 0600)) == -1) {
							snprintf(log_buffer, sizeof(log_buffer),
								"open of reject file %s failed: errno %d",
								name_buf, errno);
							log_event(PBSEVENT_JOB,
								PBS_EVENTCLASS_JOB,
								LOG_INFO, jobp->ji_qs.ji_jobid,
								log_buffer);
						} else {
#ifdef WIN32
							secure_file(name_buf, "Administrators",
								READS_MASK|WRITES_MASK|STANDARD_RIGHTS_REQUIRED);
							setmode(rfd, O_BINARY);
#endif
							len = strlen(reject_msg)+1;
							/* write also trailing null char */
							if (write(rfd, reject_msg, len) != len) {
								snprintf(log_buffer, sizeof(log_buffer),
									"write to file %s incomplete: errno %d", name_buf, errno);
								log_event(PBSEVENT_JOB,
									PBS_EVENTCLASS_JOB,
									LOG_INFO, jobp->ji_qs.ji_jobid,
									log_buffer);
							}
							close(rfd);
						}
					}

					/* every branch below exits; delete the temp script file first */
					unlink(script_name);

					if (err == PBSE_HOOKERROR)
						exit(SEND_JOB_HOOKERR);
					if (err == PBSE_HOOK_REJECT)
						exit(SEND_JOB_HOOK_REJECT);
					if (err == PBSE_HOOK_REJECT_RERUNJOB)
						exit(SEND_JOB_HOOK_REJECT_RERUNJOB);
					if (err == PBSE_HOOK_REJECT_DELETEJOB)
						exit(SEND_JOB_HOOK_REJECT_DELETEJOB);
				}
				else {
					(void)snprintf(log_buffer, sizeof(log_buffer),
						"send of job to %s failed error = %d",
						destin, pbs_errno);
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
						LOG_INFO, jobp->ji_qs.ji_jobid,
						log_buffer);
					continue;
				}
			}

			if (jobp->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT) {
				if (PBSD_jscript(con, script_name, rpp, NULL) != 0)
					continue;
			}

			if (credlen > 0) {
				/*
				 * credbuf is deliberately not freed here. A later step
				 * of this attempt (move_job_file) can still fail and
				 * "continue", and the next attempt sends the credential
				 * again; freeing it after a successful send caused a
				 * use-after-free followed by a double free. This child
				 * always leaves through exit(), which releases it.
				 */
				if (PBSD_jcred(con,
					jobp->ji_extended.ji_ext.ji_credtype,
					credbuf, credlen, rpp, NULL) != 0)
					continue;
			}

			if ((move_type == MOVE_TYPE_Exec) &&
				(jobp->ji_qs.ji_svrflags & JOB_SVFLG_HASRUN) &&
				(hostaddr !=  pbs_server_addr)) {
				/* send files created on prior run */
				if ((move_job_file(con, jobp, StdOut, rpp, NULL) != 0) ||
					(move_job_file(con, jobp, StdErr, rpp, NULL) != 0) ||
					(move_job_file(con, jobp, Chkpt, rpp, NULL) != 0))
					continue;
			}

			/* everything but the commit has been sent */
			jobp->ji_qs.ji_substate = JOB_SUBSTATE_TRNOUTCM;
		}

		if (PBSD_rdytocmt(con, job_id, rpp, NULL) != 0)
			continue;

		if (PBSD_commit(con, job_id, rpp, NULL) != 0) {
			/* delete the temp script file */
			unlink(script_name);
			exit(SEND_JOB_FATAL);
		}
		svr_disconnect(con);

		/* delete the temp script file */
		unlink(script_name);

		exit(SEND_JOB_OK);	/* This child process is all done */
	}
	if (con >= 0)
		svr_disconnect(con);
	/*
	 * If connection is actively refused by the execution node(or mother superior) OR
	 * the execution node(or mother superior) is rejecting request with error
	 * PBSE_BADHOST(failing to authorize server host), the node should be marked down.
	 */
	if ((move_type == MOVE_TYPE_Exec) && (pbs_errno == ECONNREFUSED  || pbs_errno == PBSE_BADHOST)) {
		i = SEND_JOB_NODEDW;
	} else if (should_retry_route(pbs_errno) == -1) {
		i = SEND_JOB_FATAL;
	} else {
		i = SEND_JOB_RETRY;
	}
	(void)snprintf(log_buffer, sizeof(log_buffer), "send_job failed with error %d", pbs_errno);
	log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, LOG_NOTICE,
		jobp->ji_qs.ji_jobid, log_buffer);

	/* delete the temp script file */
	unlink(script_name);

	exit(i);
	return -1;		/* NOT REACHED */

#endif /* !WIN32 */
}
Exemplo n.º 3
0
/**
 * @brief
 * 		Send mail to owner of a reservation when an event happens that
 *		requires mail, such as the reservation starts, ends or is aborted.
 *		The event is matched against those requested by the user.
 *		For Unix/Linux, a child is forked to not hold up the Server.  This child
 *		will fork/exec sendmail and pipe the To, Subject and body to it.
 *
 * @par
 *		No mail is sent when the reservation's mail points contain MAIL_NONE,
 *		even if 'force' is MAIL_FORCE. When no mail points are set and mail is
 *		not forced, only MAIL_ABORT and MAIL_CONFIRM produce mail.
 *
 * @param[in]	presv	-	pointer to the reservation structure
 * @param[in]	mailpoint	-	which mail event is triggering the send
 * @param[in]	force	-	if MAIL_FORCE, send the mail even if not requested
 * @param[in]	text	-	the body text of the mail message, may be NULL
 *
 * @return	none
 */
void
svr_mailownerResv(resc_resv *presv, int mailpoint, int force, char *text)
{
	int	 i;
	int	 addmailhost;
	char	*mailfrom;
	char	 mailto[MAIL_ADDR_BUF_LEN];
	int	 mailaddrlen = 0;
	struct array_strings *pas;
	char	*pat;
	char	*stdmessage = NULL;
#ifndef WIN32
	FILE	*outmail;
	char	*margs[5];
	int	 mfds[2];
	pid_t	 mcpid;
#endif

	if (force != MAIL_FORCE) {
		/*Not forcing out mail regardless of mailpoint */

		if (presv->ri_wattr[(int)RESV_ATR_mailpnts].at_flags &ATR_VFLAG_SET) {
			/*user has set one or more mailpoints is this one included?*/
			if (strchr(presv->ri_wattr[(int)RESV_ATR_mailpnts].at_val.at_str,
				mailpoint) == NULL)
				return;
		} else {
			/*user hasn't bothered to set any mailpoints so default to
			 *sending mail only in the case of reservation deletion and
			 *reservation confirmation
			 */
			if ((mailpoint != MAIL_ABORT) && (mailpoint != MAIL_CONFIRM))
				return;
		}
	}

	/* MAIL_NONE among the mail points suppresses mail, forced or not */
	if (presv->ri_wattr[(int)RESV_ATR_mailpnts].at_flags &ATR_VFLAG_SET) {
		if (strchr(presv->ri_wattr[(int)RESV_ATR_mailpnts].at_val.at_str,
			MAIL_NONE) != NULL)
			return;
	}

	/*
	 * ok, now we will fork a process to do the mailing to not
	 * hold up the server's other work.
	 */

#ifndef WIN32
	mcpid = fork();
	if (mcpid == -1) { /* Error on fork */
		log_err(errno, __func__, "fork failed\n");
		return;
	}
	if (mcpid > 0)
		return;		/* its all up to the child now */

	/*
	 * From here on, we are a child process of the server.
	 * Fix up file descriptors and signal handlers.
	 */

	net_close(-1);
	rpp_terminate();

	/* Unprotect child from being killed by kernel */
	daemon_protect(0, PBS_DAEMON_PROTECT_OFF);

#endif	/* ! WIN32 */

	/* Who is mail from, if SVR_ATR_mailfrom not set use default */

	if ((mailfrom = server.sv_attr[(int)SRV_ATR_mailfrom].at_val.at_str) == NULL)
		mailfrom = PBS_DEFAULT_MAIL;

	/* Who does the mail go to?  If mail-list, them; else owner */

	*mailto = '\0';
	if (presv->ri_wattr[(int)RESV_ATR_mailuser].at_flags & ATR_VFLAG_SET) {

		/* has mail user list, send to them rather than owner */

		pas = presv->ri_wattr[(int)RESV_ATR_mailuser].at_val.at_arst;
		if (pas != NULL) {
			for (i = 0; i < pas->as_usedptr; i++) {
				addmailhost = 0;
				/*
				 * mailaddrlen is the running space requirement: the
				 * address plus 2, and the host name plus 1 for the
				 * "@" when a host has to be appended.
				 */
				mailaddrlen += strlen(pas->as_string[i]) + 2;
				if ((pbs_conf.pbs_mail_host_name)  &&
				    (strchr(pas->as_string[i], (int)'@') == NULL)) {
					/* no host specified in address and      */
					/* pbs_mail_host_name is defined, use it */
					mailaddrlen += strlen(pbs_conf.pbs_mail_host_name) + 1;
					addmailhost = 1;
				}
				if ((size_t)mailaddrlen < sizeof(mailto)) {
					(void)strcat(mailto, pas->as_string[i]);
					if (addmailhost) {
						/* append pbs_mail_host_name */
						(void)strcat(mailto, "@");
						(void)strcat(mailto, pbs_conf.pbs_mail_host_name);
					}
					(void)strcat(mailto, " ");
				} else {
					/*
					 * This address does not fit: log it and stop
					 * adding recipients. The report belongs to the
					 * size check, not to the "no host to add" case.
					 */
					sprintf(log_buffer,"Email list is too long: \"%.77s...\"", mailto);
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_WARNING, presv->ri_qs.ri_resvID, log_buffer);
					break;
				}
			}
		}

	} else {

		/* no mail user list, just send to owner */

		(void)strncpy(mailto, presv->ri_wattr[(int)RESV_ATR_resv_owner].at_val.at_str, sizeof(mailto));
		mailto[(sizeof(mailto) - 1)] = '\0';	/* strncpy may not terminate */
		/* if pbs_mail_host_name is set in pbs.conf, then replace the */
		/* host name with the name specified in pbs_mail_host_name    */
		if (pbs_conf.pbs_mail_host_name) {
			if ((pat = strchr(mailto, (int)'@')) != NULL)
				*pat = '\0';	/* remove existing @host */
			if ((strlen(mailto) + strlen(pbs_conf.pbs_mail_host_name) + 1) < sizeof(mailto)) {
				/* append the pbs_mail_host_name since it fits */
				strcat(mailto, "@");
				strcat(mailto, pbs_conf.pbs_mail_host_name);
			} else {
				if (pat)
					*pat = '@';	/* didn't fit, restore the "at" sign */
				sprintf(log_buffer,"Email address is too long: \"%.77s...\"", mailto);
				log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_WARNING, presv->ri_qs.ri_resvID, log_buffer);
			}
		}
	}

#ifdef WIN32
	send_mail_detach(1, mailfrom, mailto, presv->ri_qs.ri_resvID, mailpoint,
		presv->ri_wattr[(int)RESV_ATR_resv_name].at_val.at_str, text);
#else

	/* setup sendmail command line with -f from_whom */

	margs[0] = SENDMAIL_CMD;
	margs[1] = "-f";
	margs[2] = mailfrom;
	margs[3] = mailto;
	margs[4] = NULL;

	if (pipe(mfds) == -1)
		exit(1);

	mcpid = fork();
	if (mcpid == 0) {
		/* this child will be sendmail with its stdin set to the pipe */

		/*
		 * Drop the write end first: while any write end is open in this
		 * process, reads on the pipe never report end-of-file.
		 */
		(void)close(mfds[1]);
		if (mfds[0] != 0) {
			(void)close(0);
			if (dup(mfds[0]) == -1)
				exit(1);
			(void)close(mfds[0]);	/* stdin now refers to the pipe */
		}
		(void)close(1);
		(void)close(2);
		if (execv(SENDMAIL_CMD, margs) == -1)
			exit(1);
	}
	if (mcpid == -1) {/* Error on fork */
		log_err(errno, __func__, "fork failed\n");
		(void)close(mfds[0]);
		(void)close(mfds[1]);
		exit(1);
	}

	/* parent (not the real server though) will write body of message on pipe */
	(void)close(mfds[0]);
	outmail = fdopen(mfds[1], "w");
	if (outmail == NULL)
		exit(1);

	/* Pipe in mail headers: To: and Subject: */

	fprintf(outmail, "To: %s\n", mailto);
	fprintf(outmail, "Subject: PBS RESERVATION %s\n\n", presv->ri_qs.ri_resvID);

	/* Now pipe in "standard" message */

	switch (mailpoint) {

		case MAIL_ABORT:
			/*"Aborted by Server, Scheduler, or User "*/
			stdmessage = msg_resv_abort;
			break;

		case MAIL_BEGIN:
			/*"Reservation period starting"*/
			stdmessage = msg_resv_start;
			break;

		case MAIL_END:
			/*"Reservation terminated"*/
			stdmessage = msg_resv_end;
			break;

		case MAIL_CONFIRM:
			/*scheduler requested, "CONFIRM reservation"*/
			stdmessage = msg_resv_confirm;
			break;

		default:
			/* no standard text for other mail points; stdmessage stays NULL */
			break;
	}

	fprintf(outmail, "PBS Reservation Id: %s\n", presv->ri_qs.ri_resvID);
	fprintf(outmail, "Reservation Name:   %s\n",
		presv->ri_wattr[(int)RESV_ATR_resv_name].at_val.at_str);
	if (stdmessage)
		fprintf(outmail, "%s\n", stdmessage);
	if (text != NULL)
		fprintf(outmail, "%s\n", text);
	/* closing the stream closes the write end of the pipe */
	fclose(outmail);

	exit(0);
#endif	/* ! WIN32 */
}
Exemplo n.º 4
0
int
main(int argc, char **argv)
#endif	/* WIN32 */
{
#ifdef	WIN32
	struct arg_param *p = (struct arg_param *)pv;
	int      		argc;
	char			**argv;
	SERVICE_STATUS          ss;
#endif	/* WIN32 */
	char *name = NULL;
	struct tpp_config conf;
	int rpp_fd;
	char *pc;
	int numthreads;
	char lockfile[MAXPATHLEN + 1];
	char path_log[MAXPATHLEN + 1];
	char svr_home[MAXPATHLEN + 1];
	char *log_file = 0;
	char *host;
	int port;
	char *routers = NULL;
	int c, i, rc;
	extern char *optarg;
	int	are_primary;
	int	num_var_env;
#ifndef WIN32
	struct sigaction act;
	struct sigaction oact;
#endif

#ifndef WIN32
	/*the real deal or just pbs_version and exit*/

	execution_mode(argc, argv);
#endif

	/* As a security measure and to make sure all file descriptors	*/
	/* are available to us,  close all above stderr			*/
#ifdef WIN32
	_fcloseall();
#else
	i = sysconf(_SC_OPEN_MAX);
	while (--i > 2)
		(void)close(i); /* close any file desc left open by parent */
#endif

	/* If we are not run with real and effective uid of 0, forget it */
#ifdef WIN32
	argc = p->argc;
	argv = p->argv;

	ZeroMemory(&ss, sizeof(ss));
	ss.dwCheckPoint = 0;
	ss.dwServiceType = SERVICE_WIN32_OWN_PROCESS;
	ss.dwCurrentState = g_dwCurrentState;
	ss.dwControlsAccepted = SERVICE_ACCEPT_STOP | SERVICE_ACCEPT_SHUTDOWN;
	ss.dwWaitHint = 6000;

	if (g_ssHandle != 0)
		SetServiceStatus(g_ssHandle, &ss);

	if (!isAdminPrivilege(getlogin())) {
		fprintf(stderr, "%s: Must be run by root\n", argv[0]);
		return (2);
	}

#else
	if ((getuid() != 0) || (geteuid() != 0)) {
		fprintf(stderr, "%s: Must be run by root\n", argv[0]);
		return (2);
	}
#endif	/* WIN32 */

	/* set standard umask */
#ifndef WIN32
	umask(022);
#endif

	/* load the pbs conf file */
	if (pbs_loadconf(0) == 0) {
		fprintf(stderr, "%s: Configuration error\n", argv[0]);
		return (1);
	}

	umask(022);

#ifdef	WIN32
	save_env();
#endif
	/* The following is code to reduce security risks                */
	/* start out with standard umask, system resource limit infinite */
	if ((num_var_env = setup_env(pbs_conf.pbs_environment)) == -1) {
#ifdef	WIN32
		g_dwCurrentState = SERVICE_STOPPED;
		ss.dwCurrentState = g_dwCurrentState;
		ss.dwWin32ExitCode = ERROR_INVALID_ENVIRONMENT;
		if (g_ssHandle != 0) SetServiceStatus(g_ssHandle, &ss);
		return (1);
#else
		exit(1);
#endif	/* WIN32 */
	}

#ifndef WIN32
	i = getgid();
	(void)setgroups(1, (gid_t *)&i);	/* secure suppl. groups */
#endif

	log_event_mask = &pbs_conf.pbs_comm_log_events;
	tpp_set_logmask(*log_event_mask);

#ifdef WIN32
	winsock_init();
#endif

	routers = pbs_conf.pbs_comm_routers;
	numthreads = pbs_conf.pbs_comm_threads;

	server_host[0] = '\0';
	if (pbs_conf.pbs_comm_name) {
		name = pbs_conf.pbs_comm_name;
		host = tpp_parse_hostname(name, &port);
		if (host)
			snprintf(server_host, sizeof(server_host), "%s", host);
		free(host);
		host = NULL;
	} else if (pbs_conf.pbs_leaf_name) {
		char *endp;

		snprintf(server_host, sizeof(server_host), "%s", pbs_conf.pbs_leaf_name);
		endp = strchr(server_host, ','); /* find the first name */
		if (endp)
			*endp = '\0';
		endp = strchr(server_host, ':'); /* cut out the port */
		if (endp)
			*endp = '\0';
		name = server_host;
	} else {
		if (gethostname(server_host, (sizeof(server_host) - 1)) == -1) {
#ifndef WIN32
			sprintf(log_buffer, "Could not determine my hostname, errno=%d", errno);
#else
			sprintf(log_buffer, "Could not determine my hostname, errno=%d", WSAGetLastError());
#endif
			fprintf(stderr, "%s\n", log_buffer);
			return (1);
		}
		if ((get_fullhostname(server_host, server_host, (sizeof(server_host) - 1)) == -1)) {
			sprintf(log_buffer, "Could not determine my hostname");
			fprintf(stderr, "%s\n", log_buffer);
			return (1);
		}
		name = server_host;
	}
	if (server_host[0] == '\0') {
		sprintf(log_buffer, "Could not determine server host");
		fprintf(stderr, "%s\n", log_buffer);
		return (1);
	}

	while ((c = getopt(argc, argv, "r:t:e:N")) != -1) {
		switch (c) {
			case 'e': *log_event_mask = strtol(optarg, NULL, 0);
				break;
			case 'r':
				routers = optarg;
				break;
			case 't':
				numthreads = atol(optarg);
				if (numthreads == -1) {
					usage(argv[0]);
					return (1);
				}
				break;
			case 'N':
				stalone = 1;
				break;
			default:
				usage(argv[0]);
				return (1);
		}
	}

	(void)strcpy(daemonname, "Comm@");
	(void)strcat(daemonname, name);
	if ((pc = strchr(daemonname, (int)'.')) != NULL)
		*pc = '\0';

	if(set_msgdaemonname(daemonname)) {
		fprintf(stderr, "Out of memory\n");
		return 1;
	}

	(void) snprintf(path_log, sizeof(path_log), "%s/%s", pbs_conf.pbs_home_path, PBS_COMM_LOGDIR);
#ifdef WIN32
	/*
	 * let SCM wait 10 seconds for log_open() to complete
	 * as it does network interface query which can take time
	 */

	ss.dwCheckPoint++;
	ss.dwWaitHint = 60000;
	if (g_ssHandle != 0) SetServiceStatus(g_ssHandle, &ss);
#endif
	(void) log_open(log_file, path_log);

	/* set pbs_comm's process limits */
	set_limits(); /* set_limits can call log_record, so call only after opening log file */

	/* set tcp function pointers */
	set_tpp_funcs(log_tppmsg);

	(void) snprintf(svr_home, sizeof(svr_home), "%s/%s", pbs_conf.pbs_home_path, PBS_SVR_PRIVATE);
	if (chdir(svr_home) != 0) {
		(void) sprintf(log_buffer, msg_init_chdir, svr_home);
		log_err(-1, __func__, log_buffer);
		return (1);
	}

	(void) sprintf(lockfile, "%s/%s/comm.lock", pbs_conf.pbs_home_path, PBS_SVR_PRIVATE);
	if ((are_primary = are_we_primary()) == FAILOVER_SECONDARY) {
		strcat(lockfile, ".secondary");
	} else if (are_primary == FAILOVER_CONFIG_ERROR) {
		sprintf(log_buffer, "Failover configuration error");
		log_err(-1, __func__, log_buffer);
#ifdef WIN32
		g_dwCurrentState = SERVICE_STOPPED;
		ss.dwCurrentState = g_dwCurrentState;
		ss.dwWin32ExitCode = ERROR_SERVICE_NOT_ACTIVE;
		if (g_ssHandle != 0) SetServiceStatus(g_ssHandle, &ss);
#endif
		return (3);
	}

	if ((lockfds = open(lockfile, O_CREAT | O_WRONLY, 0600)) < 0) {
		(void) sprintf(log_buffer, "pbs_comm: unable to open lock file");
		log_err(errno, __func__, log_buffer);
		return (1);
	}

	if ((host = tpp_parse_hostname(name, &port)) == NULL) {
		sprintf(log_buffer, "Out of memory parsing leaf name");
		log_err(errno, __func__, log_buffer);
		return (1);
	}

	rc = 0;
	if (pbs_conf.auth_method == AUTH_RESV_PORT) {
		rc = set_tpp_config(&pbs_conf, &conf, host, port, routers, pbs_conf.pbs_use_compression,
				TPP_AUTH_RESV_PORT, NULL, NULL);
	} else {
		/* for all non-resv-port based authentication use a callback from TPP */
		rc = set_tpp_config(&pbs_conf, &conf, host, port, routers, pbs_conf.pbs_use_compression,
				TPP_AUTH_EXTERNAL, get_ext_auth_data, validate_ext_auth_data);
	}
	if (rc == -1) {
		(void) sprintf(log_buffer, "Error setting TPP config");
		log_err(-1, __func__, log_buffer);
		return (1);
	}
	free(host);

	i = 0;
	if (conf.routers) {
		while (conf.routers[i]) {
			sprintf(log_buffer, "Router[%d]:%s", i, conf.routers[i]);
			fprintf(stdout, "%s\n", log_buffer);
			log_event(PBSEVENT_SYSTEM | PBSEVENT_FORCE, PBS_EVENTCLASS_SERVER, LOG_INFO, msg_daemonname, log_buffer);
			i++;
		}
	}

#ifndef DEBUG
#ifndef WIN32
	if (stalone != 1)
		go_to_background();
#endif
#endif


#ifdef WIN32
	ss.dwCheckPoint = 0;
	g_dwCurrentState = SERVICE_RUNNING;
	ss.dwCurrentState = g_dwCurrentState;
	if (g_ssHandle != 0) SetServiceStatus(g_ssHandle, &ss);
#endif

	if (already_forked == 0)
		lock_out(lockfds, F_WRLCK);

	/* go_to_backgroud call creates a forked process,
	 * thus print/log pid only after go_to_background()
	 * has been called
	 */
	sprintf(log_buffer, "%s ready (pid=%d), Proxy Name:%s, Threads:%d", argv[0], getpid(), conf.node_name, numthreads);
	fprintf(stdout, "%s\n", log_buffer);
	log_event(PBSEVENT_SYSTEM | PBSEVENT_FORCE, PBS_EVENTCLASS_SERVER, LOG_INFO, msg_daemonname, log_buffer);

#ifndef DEBUG
	pbs_close_stdfiles();
#endif

#ifdef WIN32
	signal(SIGINT, stop_me);
	signal(SIGTERM, stop_me);
#else
	sigemptyset(&act.sa_mask);
	act.sa_flags = 0;
	act.sa_handler = hup_me;
	if (sigaction(SIGHUP, &act, &oact) != 0) {
		log_err(errno, __func__, "sigaction for HUP");
		return (2);
	}
	act.sa_handler = stop_me;
	if (sigaction(SIGINT, &act, &oact) != 0) {
		log_err(errno, __func__, "sigaction for INT");
		return (2);
	}
	if (sigaction(SIGTERM, &act, &oact) != 0) {
		log_err(errno, __func__, "sigactin for TERM");
		return (2);
	}
	if (sigaction(SIGQUIT, &act, &oact) != 0) {
		log_err(errno, __func__, "sigactin for QUIT");
		return (2);
	}
#ifdef SIGSHUTDN
	if (sigaction(SIGSHUTDN, &act, &oact) != 0) {
		log_err(errno, __func__, "sigactin for SHUTDN");
		return (2);
	}
#endif	/* SIGSHUTDN */

	act.sa_handler = SIG_IGN;
	if (sigaction(SIGPIPE, &act, &oact) != 0) {
		log_err(errno, __func__, "sigaction for PIPE");
		return (2);
	}
	if (sigaction(SIGUSR1, &act, &oact) != 0) {
		log_err(errno, __func__, "sigaction for USR1");
		return (2);
	}
	if (sigaction(SIGUSR2, &act, &oact) != 0) {
		log_err(errno, __func__, "sigaction for USR2");
		return (2);
	}
#endif 	/* WIN32 */

	conf.node_type = TPP_ROUTER_NODE;
	conf.numthreads = numthreads;

	if ((rpp_fd = tpp_init_router(&conf)) == -1) {
		log_err(-1, __func__, "tpp init failed\n");
		return 1;
	}

	/* Protect from being killed by kernel */
	daemon_protect(0, PBS_DAEMON_PROTECT_ON);

	/* go in a while loop */
	while (get_out == 0) {

		if (hupped == 1) {
			struct pbs_config pbs_conf_bak;
			int new_logevent;

			hupped = 0; /* reset back */
			memcpy(&pbs_conf_bak, &pbs_conf, sizeof(struct pbs_config));

			if (pbs_loadconf(1) == 0) {
				log_tppmsg(LOG_CRIT, NULL, "Configuration error, ignoring");
				memcpy(&pbs_conf, &pbs_conf_bak, sizeof(struct pbs_config));
			} else {
				/* restore old pbs.conf */
				new_logevent = pbs_conf.pbs_comm_log_events;
				memcpy(&pbs_conf, &pbs_conf_bak, sizeof(struct pbs_config));
				pbs_conf.pbs_comm_log_events = new_logevent;
				log_tppmsg(LOG_INFO, NULL, "Processed SIGHUP");

				log_event_mask = &pbs_conf.pbs_comm_log_events;
				tpp_set_logmask(*log_event_mask);
			}
		}

		sleep(3);
	}

	tpp_router_shutdown();

	log_event(PBSEVENT_SYSTEM | PBSEVENT_FORCE, PBS_EVENTCLASS_SERVER, LOG_NOTICE, msg_daemonname, "Exiting");
	log_close(1);

	lock_out(lockfds, F_UNLCK);	/* unlock  */
	(void)close(lockfds);
	(void)unlink(lockfile);

	return 0;
}
Exemplo n.º 5
0
/**
 * @brief
 * 		Send mail to owner of a job when an event happens that
 *		requires mail, such as the job starts, ends or is aborted.
 *		The event is matched against those requested by the user.
 *		For Unix/Linux, a child is forked to not hold up the Server.  This child
 *		will fork/exec sendmail and pipe the To, Subject and body to it.
 *
 * @par
 *		Unless forced, mail for a job is only sent when the job's mail
 *		points contain "mailpoint" (defaulting to abort-only when the
 *		mail points are unset); for a subjob the mail points must also
 *		contain MAIL_SUBJOB.  Mail that is not job related (pjob is NULL)
 *		is only sent when the server's "mailfrom" attribute is set.
 *
 * @par
 *		Recipients are the job's mail user list if set, otherwise the
 *		job owner; system mail (pjob is NULL) goes to the "mailfrom"
 *		address.  If pbs_mail_host_name is configured it is appended to
 *		list entries lacking an "@host" and replaces the host part of
 *		the owner's address.
 *
 * @par
 *		On Unix/Linux the server process returns right after the first
 *		fork; everything after that runs in the child, which never
 *		returns to the caller (it terminates via exit()).
 *		On Windows the work is handed to send_mail_detach().
 *
 * @param[in]	jid	-	the Job ID (string); if NULL and pjob is given,
 *				the job id stored in pjob is used
 * @param[in]	pjob	-	pointer to the job structure, or NULL for
 *				server (system) related mail
 * @param[in]	mailpoint	-	which mail event is triggering the send
 * @param[in]	force	-	if MAIL_FORCE, force the mail even if not requested
 * @param[in]	text	-	the body text of the mail message, may be NULL
 *
 * @return	none
 */
void
svr_mailowner_id(char *jid, job *pjob, int mailpoint, int force, char *text)
{
	int	 addmailhost;
	int	 i;
	char	*mailfrom;
	char	 mailto[MAIL_ADDR_BUF_LEN];
	int	 mailaddrlen = 0;
	struct array_strings *pas;
	char	*stdmessage = NULL;
	char	*pat;
	extern  char server_host[];

#ifndef WIN32
	FILE   *outmail;
	char   *margs[5];
	int     mfds[2];
	pid_t   mcpid;
#endif

	/* if force is true, force the mail out regardless of mailpoint */

	if (force != MAIL_FORCE) {
		if (pjob != 0) {

			/* subjobs only get mail if the MAIL_SUBJOB point was requested */
			if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SubJob) {
				if (pjob->ji_wattr[(int)JOB_ATR_mailpnts].at_flags & ATR_VFLAG_SET) {
					if (strchr(pjob->ji_wattr[(int)JOB_ATR_mailpnts].at_val.at_str,
						MAIL_SUBJOB) == NULL)
						return;
				} else {
					return;
				}
			}

			/* see if user specified mail of this type */

			if (pjob->ji_wattr[(int)JOB_ATR_mailpnts].at_flags & ATR_VFLAG_SET) {
				if (strchr(pjob->ji_wattr[(int)JOB_ATR_mailpnts].at_val.at_str,
					mailpoint) == NULL)
					return;
			} else if (mailpoint != MAIL_ABORT)	/* not set, default to abort */
				return;

		} else if ((server.sv_attr[(int)SRV_ATR_mailfrom].at_flags & ATR_VFLAG_SET) == 0) {

			/* not job related, must be system related;  not sent unless */
			/* forced or if "mailfrom" attribute set         		 */
			return;
		}
	}

	/*
	 * ok, now we will fork a process to do the mailing to not
	 * hold up the server's other work.
	 */

#ifndef WIN32
	mcpid = fork();
	if (mcpid == -1) { /* Error on fork */
		log_err(errno, __func__, "fork failed\n");
		return;
	}
	if (mcpid > 0)
		return;		/* its all up to the child now */

	/*
	 * From here on, we are a child process of the server.
	 * Fix up file descriptors and signal handlers.
	 */
	net_close(-1);
	if (pfn_rpp_terminate)
		rpp_terminate();

	/* Unprotect child from being killed by kernel */
	daemon_protect(0, PBS_DAEMON_PROTECT_OFF);

#endif	/* ! WIN32 */

	/* Who is mail from, if SVR_ATR_mailfrom not set use default */

	if ((mailfrom = server.sv_attr[(int)SRV_ATR_mailfrom].at_val.at_str) == NULL)
		mailfrom = PBS_DEFAULT_MAIL;

	/* Who does the mail go to?  If mail-list, them; else owner */

	*mailto = '\0';
	if (pjob != 0) {
		if (jid == NULL)
			jid = pjob->ji_qs.ji_jobid;

		if (pjob->ji_wattr[(int)JOB_ATR_mailuser].at_flags & ATR_VFLAG_SET) {

			/* has mail user list, send to them rather than owner */

			pas = pjob->ji_wattr[(int)JOB_ATR_mailuser].at_val.at_arst;
			if (pas != NULL) {
				for (i = 0; i < pas->as_usedptr; i++) {
					addmailhost = 0;
					/*
					 * mailaddrlen tracks the space needed so far,
					 * so it is checked before anything is appended
					 */
					mailaddrlen += strlen(pas->as_string[i]) + 2;
					if ((pbs_conf.pbs_mail_host_name)  &&
					    (strchr(pas->as_string[i], (int)'@') == NULL)) {
							/* no host specified in address and      */
							/* pbs_mail_host_name is defined, use it */
							mailaddrlen += strlen(pbs_conf.pbs_mail_host_name) + 1;
							addmailhost = 1;
					}
					if (mailaddrlen < sizeof(mailto)) {
						(void)strcat(mailto, pas->as_string[i]);
						if (addmailhost) {
							/* append pbs_mail_host_name */
							(void)strcat(mailto, "@");
							(void)strcat(mailto, pbs_conf.pbs_mail_host_name);
						}
						(void)strcat(mailto, " ");
					} else {
						/* out of room: mail whoever fit so far, drop the rest */
					  	sprintf(log_buffer,"Email list is too long: \"%.77s...\"", mailto);
						log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_WARNING, pjob->ji_qs.ji_jobid, log_buffer);
						break;
					}
				}
			}

		} else {

			/* no mail user list, just send to owner */

			(void)strncpy(mailto, pjob->ji_wattr[(int)JOB_ATR_job_owner].at_val.at_str, sizeof(mailto));
			mailto[(sizeof(mailto) - 1)] = '\0';
			/* if pbs_mail_host_name is set in pbs.conf, then replace the */
			/* host name with the name specified in pbs_mail_host_name    */
			if (pbs_conf.pbs_mail_host_name) {
				if ((pat = strchr(mailto, (int)'@')) != NULL)
					*pat = '\0';	/* remove existing @host */
				if ((strlen(mailto) + strlen(pbs_conf.pbs_mail_host_name) + 1) < sizeof(mailto)) {
					/* append the pbs_mail_host_name since it fits */
					strcat(mailto, "@");
					strcat(mailto, pbs_conf.pbs_mail_host_name);
				} else {
				  	if (pat)
						*pat = '@';	/* didn't fit, restore the "at" sign */
				  	sprintf(log_buffer,"Email address is too long: \"%.77s...\"", mailto);
					log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, LOG_WARNING, pjob->ji_qs.ji_jobid, log_buffer);
				}
			}
		}

	} else {
		/*
		 * send system related mail to "mailfrom"; the attribute value
		 * has no guaranteed length, so bound the copy into mailto
		 */
		if (strlen(mailfrom) >= sizeof(mailto))
			log_err(-1, __func__, "mailfrom address is too long, truncating");
		(void)snprintf(mailto, sizeof(mailto), "%s", mailfrom);
	}

#ifdef WIN32
	/* if pjob is not null, then send a JOB type email (1st param=0); */
	/* otherwise, send a SERVER type email (1st param=2)               */

	send_mail_detach((pjob?0:2), mailfrom, mailto,
		(pjob?pjob->ji_qs.ji_jobid:""), mailpoint,
		(pjob?pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str:""),
		text);

#else
	/* setup sendmail command line with -f from_whom */

	margs[0] = SENDMAIL_CMD;
	margs[1] = "-f";
	margs[2] = mailfrom;
	margs[3] = mailto;
	margs[4] = NULL;

	if (pipe(mfds) == -1)
		exit(1);

	mcpid = fork();
	if (mcpid == 0) {
		/* this child will be sendmail with its stdin set to the pipe */

		/*
		 * Drop our copy of the write end; otherwise the exec'd sendmail
		 * would itself keep the pipe open for writing and never see
		 * end-of-file on its stdin.
		 */
		(void)close(mfds[1]);
		if (mfds[0] != 0) {
			(void)close(0);
			if (dup(mfds[0]) == -1)
				exit(1);
			/* stdin now refers to the pipe, the original fd is not needed */
			(void)close(mfds[0]);
		}
		(void)close(1);
		(void)close(2);
		/* execv only returns on failure */
		if (execv(SENDMAIL_CMD, margs) == -1)
			exit(1);
	}
	if (mcpid == -1) {/* Error on fork */
		log_err(errno, __func__, "fork failed\n");
		(void)close(mfds[0]);
		exit(1);
	}

	/* parent (not the real server though) will write body of message on pipe */
	(void)close(mfds[0]);
	outmail = fdopen(mfds[1], "w");
	if (outmail == NULL)
		exit(1);

	/* Pipe in mail headers: To: and Subject: */

	fprintf(outmail, "To: %s\n", mailto);

	if (pjob)
		fprintf(outmail, "Subject: PBS JOB %s\n\n", jid);
	else
		fprintf(outmail, "Subject: PBS Server on %s\n\n", server_host);

	/* Now pipe in "standard" message */

	switch (mailpoint) {

		case MAIL_ABORT:
			stdmessage = msg_job_abort;
			break;

		case MAIL_BEGIN:
			stdmessage = msg_job_start;
			break;

		case MAIL_END:
			stdmessage = msg_job_end;
			break;

		case MAIL_STAGEIN:
			stdmessage = msg_job_stageinfail;
			break;

	}

	if (pjob) {
		fprintf(outmail, "PBS Job Id: %s\n", jid);
		fprintf(outmail, "Job Name:   %s\n",
			pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str);
	}
	if (stdmessage)
		fprintf(outmail, "%s\n", stdmessage);
	if (text != NULL)
		fprintf(outmail, "%s\n", text);
	/* closing the stream flushes the message and closes the pipe's write end */
	fclose(outmail);

	exit(0);
#endif	/* WIN32 */
}