bool
NamedPipeWriter::write_data(void* buffer, int len)
{
	assert(m_initialized);

	// if we're writing to a pipe that has multiple writers,
	// we need to make sure our messages are no larger than
	// PIPE_BUF to guarantee atomic writes
	//
	assert(len <= PIPE_BUF);

	// if we have a watchdog, we don't go right into a blocking
	// write. instead, we select with both the real pipe and the
	// watchdog pipe, which will close if our peer shuts down or
	// crashes
	//
	if (m_watchdog != NULL) {
		int watchdog_pipe = m_watchdog->get_file_descriptor();
		Selector selector;
		selector.add_fd( m_pipe, Selector::IO_WRITE );
		selector.add_fd( watchdog_pipe, Selector::IO_READ );
		selector.execute();
		if ( selector.failed() || selector.signalled() ) {
			dprintf(D_ALWAYS,
			        "select error: %s (%d)\n",
			        strerror(selector.select_errno()),
			        selector.select_errno());
			return false;
		}
		if ( selector.fd_ready( watchdog_pipe, Selector::IO_READ ) ) {
			dprintf(D_ALWAYS,
			        "error writing to named pipe: "
			            "watchdog pipe has closed\n");
			return false;
		}
	}

	// do the write
	//
	int bytes = write(m_pipe, buffer, len);
	if (bytes != len) {
		if (bytes == -1) {
			dprintf(D_ALWAYS,
			        "write error: %s (%d)\n",
			        strerror(errno),
			        errno);
		}
		else {
			dprintf(D_ALWAYS,
			        "error: wrote %d of %d bytes\n",
			        bytes,
			        len);
		}
		return false;
	}

	return true;
}
Esempio n. 2
0
bool
VanillaProc::JobReaper(int pid, int status)
{
	dprintf(D_FULLDEBUG,"Inside VanillaProc::JobReaper()\n");

	//
	// Run all the reapers first, since some of them change the exit status.
	//
	if( m_pid_ns_status_filename.length() > 0 ) {
		status = pidNameSpaceReaper( status );
	}
	bool jobExited = OsProc::JobReaper( pid, status );
	if( pid != JobPid ) { return jobExited; }

#if defined(LINUX)
	// On newer kernels if memory.use_hierarchy==1, then we cannot disable
	// the OOM killer.  Hence, we have to be ready for a SIGKILL to be delivered
	// by the kernel at the same time we get the notification.  Hence, if we
	// see an exit signal, we must also check the event file descriptor.
	//
	// outOfMemoryEvent() is aware of checkpointing and will mention that
	// the OOM event happened during a checkpoint.
	int efd = -1;
	if( (m_oom_efd >= 0) && daemonCore->Get_Pipe_FD(m_oom_efd, &efd) && (efd != -1) ) {
		Selector selector;
		selector.add_fd(efd, Selector::IO_READ);
		selector.set_timeout(0);
		selector.execute();
		if( !selector.failed() && !selector.timed_out() && selector.has_ready() && selector.fd_ready(efd, Selector::IO_READ) ) {
			outOfMemoryEvent( m_oom_efd );
		}
	}
#endif

	//
	// We have three cases to consider:
	//   * if we're checkpointing; or
	//   * if we see a special checkpoint exit code; or
	//   * there's no special case to consider.
	//

	bool wantsFileTransferOnCheckpointExit = false;
	JobAd->LookupBool( ATTR_WANT_FT_ON_CHECKPOINT, wantsFileTransferOnCheckpointExit );

	int checkpointExitCode = 0;
	JobAd->LookupInteger( ATTR_CHECKPOINT_EXIT_CODE, checkpointExitCode );
	int checkpointExitSignal = 0;
	JobAd->LookupInteger( ATTR_CHECKPOINT_EXIT_SIGNAL, checkpointExitSignal );
	bool checkpointExitBySignal = 0;
	JobAd->LookupBool( ATTR_CHECKPOINT_EXIT_BY_SIGNAL, checkpointExitBySignal );

	int successfulCheckpointStatus = 0;
	if( checkpointExitBySignal ) {
		successfulCheckpointStatus = checkpointExitSignal;
	} else if( checkpointExitCode != 0 ) {
		successfulCheckpointStatus = checkpointExitCode << 8;
#if defined( WINDOWS )
		successfulCheckpointStatus = checkpointExitCode;
#endif
	}

	if( isCheckpointing ) {
		dprintf( D_FULLDEBUG, "Inside VanillaProc::JobReaper() during a checkpoint\n" );

		if( exit_status == successfulCheckpointStatus ) {
			if( isSoftKilling ) {
				notifySuccessfulEvictionCheckpoint();
				return true;
			}

			restartCheckpointedJob();
			isCheckpointing = false;
			return false;
		} else {
			// The job exited without taking a checkpoint.  If we don't do
			// anything, it will be reported as if the error code or signal
			// had happened naturally (and the job will usually exit the
			// queue).  This could confuse the users.
			//
			// Instead, we'll put the job on hold, figuring that if the job
			// requested that we (periodically) send it a signal, and we
			// did, that it's not our fault that the job failed.  This has
			// the convenient side-effect of not overwriting the job's
			// previous checkpoint(s), if any (since file transfer doesn't
			// occur when the job goes on hold).
			killFamilyIfWarranted();
			recordFinalUsage();

			std::string holdMessage;
			formatstr( holdMessage, "Job did not exit as promised when sent its checkpoint signal.  "
				"Promised exit was %s %u, actual exit status was %s %u.",
				checkpointExitBySignal ? "on signal" : "with exit code",
				checkpointExitBySignal ? checkpointExitSignal : checkpointExitCode,
				WIFSIGNALED( exit_status ) ? "on signal" : "with exit code",
				WIFSIGNALED( exit_status ) ? WTERMSIG( exit_status ) : WEXITSTATUS( exit_status ) );
			Starter->jic->holdJob( holdMessage.c_str(), CONDOR_HOLD_CODE_FailedToCheckpoint, exit_status );
			Starter->Hold();
			return true;
		}
	} else if( wantsFileTransferOnCheckpointExit && exit_status == successfulCheckpointStatus ) {
		dprintf( D_FULLDEBUG, "Inside VanillaProc::JobReaper() and the job self-checkpointed.\n" );

		if( isSoftKilling ) {
			notifySuccessfulEvictionCheckpoint();
			return true;
		} else {
			restartCheckpointedJob();
			return false;
		}
	} else {
		// If the parent job process died, clean up all of the job's processes.
		killFamilyIfWarranted();

		// Record final usage stats for this process family, since
		// once the reaper returns, the family is no longer
		// registered with DaemonCore and we'll never be able to
		// get this information again.
		recordFinalUsage();

		return jobExited;
	}
}