bool NamedPipeWriter::write_data(void* buffer, int len) { assert(m_initialized); // if we're writing to a pipe that has multiple writers, // we need to make sure our messages are no larger than // PIPE_BUF to guarantee atomic writes // assert(len <= PIPE_BUF); // if we have a watchdog, we don't go right into a blocking // write. instead, we select with both the real pipe and the // watchdog pipe, which will close if our peer shuts down or // crashes // if (m_watchdog != NULL) { int watchdog_pipe = m_watchdog->get_file_descriptor(); Selector selector; selector.add_fd( m_pipe, Selector::IO_WRITE ); selector.add_fd( watchdog_pipe, Selector::IO_READ ); selector.execute(); if ( selector.failed() || selector.signalled() ) { dprintf(D_ALWAYS, "select error: %s (%d)\n", strerror(selector.select_errno()), selector.select_errno()); return false; } if ( selector.fd_ready( watchdog_pipe, Selector::IO_READ ) ) { dprintf(D_ALWAYS, "error writing to named pipe: " "watchdog pipe has closed\n"); return false; } } // do the write // int bytes = write(m_pipe, buffer, len); if (bytes != len) { if (bytes == -1) { dprintf(D_ALWAYS, "write error: %s (%d)\n", strerror(errno), errno); } else { dprintf(D_ALWAYS, "error: wrote %d of %d bytes\n", bytes, len); } return false; } return true; }
bool VanillaProc::JobReaper(int pid, int status) { dprintf(D_FULLDEBUG,"Inside VanillaProc::JobReaper()\n"); // // Run all the reapers first, since some of them change the exit status. // if( m_pid_ns_status_filename.length() > 0 ) { status = pidNameSpaceReaper( status ); } bool jobExited = OsProc::JobReaper( pid, status ); if( pid != JobPid ) { return jobExited; } #if defined(LINUX) // On newer kernels if memory.use_hierarchy==1, then we cannot disable // the OOM killer. Hence, we have to be ready for a SIGKILL to be delivered // by the kernel at the same time we get the notification. Hence, if we // see an exit signal, we must also check the event file descriptor. // // outOfMemoryEvent() is aware of checkpointing and will mention that // the OOM event happened during a checkpoint. int efd = -1; if( (m_oom_efd >= 0) && daemonCore->Get_Pipe_FD(m_oom_efd, &efd) && (efd != -1) ) { Selector selector; selector.add_fd(efd, Selector::IO_READ); selector.set_timeout(0); selector.execute(); if( !selector.failed() && !selector.timed_out() && selector.has_ready() && selector.fd_ready(efd, Selector::IO_READ) ) { outOfMemoryEvent( m_oom_efd ); } } #endif // // We have three cases to consider: // * if we're checkpointing; or // * if we see a special checkpoint exit code; or // * there's no special case to consider. // bool wantsFileTransferOnCheckpointExit = false; JobAd->LookupBool( ATTR_WANT_FT_ON_CHECKPOINT, wantsFileTransferOnCheckpointExit ); int checkpointExitCode = 0; JobAd->LookupInteger( ATTR_CHECKPOINT_EXIT_CODE, checkpointExitCode ); int checkpointExitSignal = 0; JobAd->LookupInteger( ATTR_CHECKPOINT_EXIT_SIGNAL, checkpointExitSignal ); bool checkpointExitBySignal = 0; JobAd->LookupBool( ATTR_CHECKPOINT_EXIT_BY_SIGNAL, checkpointExitBySignal ); int successfulCheckpointStatus = 0; if( checkpointExitBySignal ) { successfulCheckpointStatus = checkpointExitSignal; } else if( checkpointExitCode != 0 ) { successfulCheckpointStatus = checkpointExitCode << 8; #if defined( WINDOWS ) successfulCheckpointStatus = checkpointExitCode; #endif } if( isCheckpointing ) { dprintf( D_FULLDEBUG, "Inside VanillaProc::JobReaper() during a checkpoint\n" ); if( exit_status == successfulCheckpointStatus ) { if( isSoftKilling ) { notifySuccessfulEvictionCheckpoint(); return true; } restartCheckpointedJob(); isCheckpointing = false; return false; } else { // The job exited without taking a checkpoint. If we don't do // anything, it will be reported as if the error code or signal // had happened naturally (and the job will usually exit the // queue). This could confuse the users. // // Instead, we'll put the job on hold, figuring that if the job // requested that we (periodically) send it a signal, and we // did, that it's not our fault that the job failed. This has // the convenient side-effect of not overwriting the job's // previous checkpoint(s), if any (since file transfer doesn't // occur when the job goes on hold). killFamilyIfWarranted(); recordFinalUsage(); std::string holdMessage; formatstr( holdMessage, "Job did not exit as promised when sent its checkpoint signal. " "Promised exit was %s %u, actual exit status was %s %u.", checkpointExitBySignal ? "on signal" : "with exit code", checkpointExitBySignal ? checkpointExitSignal : checkpointExitCode, WIFSIGNALED( exit_status ) ? "on signal" : "with exit code", WIFSIGNALED( exit_status ) ? WTERMSIG( exit_status ) : WEXITSTATUS( exit_status ) ); Starter->jic->holdJob( holdMessage.c_str(), CONDOR_HOLD_CODE_FailedToCheckpoint, exit_status ); Starter->Hold(); return true; } } else if( wantsFileTransferOnCheckpointExit && exit_status == successfulCheckpointStatus ) { dprintf( D_FULLDEBUG, "Inside VanillaProc::JobReaper() and the job self-checkpointed.\n" ); if( isSoftKilling ) { notifySuccessfulEvictionCheckpoint(); return true; } else { restartCheckpointedJob(); return false; } } else { // If the parent job process died, clean up all of the job's processes. killFamilyIfWarranted(); // Record final usage stats for this process family, since // once the reaper returns, the family is no longer // registered with DaemonCore and we'll never be able to // get this information again. recordFinalUsage(); return jobExited; } }