//--------------------------------------------------------------------------- void JobstateLog::Write( const time_t *eventTimeP, const MyString &info ) { // // Here for "fake" events like JOB_SUCCESS, the event will get // the timestamp of the last "real" event from the job; this is // so that we can correctly avoid re-writing the "fake" events // in recovery mode. // time_t eventTime; if ( eventTimeP != NULL && *eventTimeP != 0 ) { eventTime = *eventTimeP; } else { eventTime = time( NULL ); } // Avoid "re-writing" events in recovery mode: // If the event time is *after* _lastTimestampWritten, we // write the event. If the event time is *before* // _lastTimestampWritten, we don't write the event. If // the times are equal, we have to do a further test down // below. if ( eventTime < _lastTimestampWritten ) { return; } MyString outline; outline.formatstr( "%lu %s", (unsigned long)eventTime, info.Value() ); // // If this event's time matches the time of the last "real" // event in the pre-recovery part of the file, we check whether // this line is already in the pre-recovery part of the file, // and if it is we don't write it again. // if ( (eventTime == _lastTimestampWritten) && (_lastTimestampLines.count( outline ) > 0) ) { return; } if ( !_outfile ) { _outfile = safe_fopen_wrapper_follow( _jobstateLogFile, "a" ); if ( !_outfile ) { debug_printf( DEBUG_QUIET, "Could not open jobstate log file %s for writing.\n", _jobstateLogFile ); main_shutdown_graceful(); return; } } fprintf( _outfile, "%s\n", outline.Value() ); }
//--------------------------------------------------------------------------- void JobstateLog::Flush() { if ( !_jobstateLogFile ) { return; } if ( fflush( _outfile ) != 0 ) { debug_printf( DEBUG_QUIET, "Error flushing output to jobstate log file %s.\n", _jobstateLogFile ); main_shutdown_graceful(); } }
//--------------------------------------------------------------------------- // Here we re-read the jobstate.log file to find out what sequence number // we should start with when running a rescue DAG. void JobstateLog::InitializeRescue() { debug_printf( DEBUG_DEBUG_2, "JobstateLog::InitializeRescue()\n" ); if ( !_jobstateLogFile ) { return; } FILE *infile = safe_fopen_wrapper_follow( _jobstateLogFile, "r" ); if ( !infile ) { // This is a fatal error, because by the time we get here, // we should, at the very least, have written the // DAGMAN_STARTED "event". debug_printf( DEBUG_QUIET, "Could not open jobstate log file %s for reading.\n", _jobstateLogFile ); main_shutdown_graceful(); return; } int maxSeqNum = 0; MyString line; while ( line.readLine( infile ) ) { time_t newTimestamp; MyString nodeName; int seqNum; if ( ParseLine( line, newTimestamp, nodeName, seqNum ) ) { maxSeqNum = MAX( maxSeqNum, seqNum ); } } fclose( infile ); debug_printf( DEBUG_DEBUG_2, "Max sequence num in jobstate.log file: %d\n", maxSeqNum ); Job::SetJobstateNextSequenceNum( maxSeqNum + 1 ); }
/* ** Callback from daemon-core kill all daemons and go away. */ void main_shutdown_normal() { // if we are doing peaceful tell the children, and set a timer to do the real shutdown // so the children have a chance to notice the messages // bool fTimer = false; if (daemonCore->GetPeacefulShutdown()) { int timeout = 5; if (daemons.SetPeacefulShutdown(timeout) > 0) { int tid = daemonCore->Register_Timer(timeout+1, 0, (TimerHandler)main_shutdown_graceful, "main_shutdown_graceful"); if (tid == -1) dprintf( D_ALWAYS, "ERROR! Can't register DaemonCore timer!\n" ); else fTimer = true; } } if ( ! fTimer) { main_shutdown_graceful(); } }
void init_params() { char *tmp; static int master_name_in_config = 0; if( ! master_name_in_config ) { // First time, or we know it's not in the config file. if( ! MasterName ) { // Not set on command line tmp = param( "MASTER_NAME" ); if( tmp ) { MasterName = build_valid_daemon_name( tmp ); master_name_in_config = 1; free( tmp ); } } } else { delete [] MasterName; tmp = param( "MASTER_NAME" ); MasterName = build_valid_daemon_name( tmp ); free( tmp ); } if( MasterName ) { dprintf( D_FULLDEBUG, "Using name: %s\n", MasterName ); } if (!param_boolean_crufty("START_MASTER", true)) { dprintf( D_ALWAYS, "START_MASTER was set to FALSE, shutting down.\n" ); StartDaemons = FALSE; main_shutdown_graceful(); } StartDaemons = TRUE; if (!param_boolean_crufty("START_DAEMONS", true)) { dprintf( D_ALWAYS, "START_DAEMONS flag was set to FALSE. Not starting daemons.\n" ); StartDaemons = FALSE; } // If we were sent the daemons_off command, don't forget that // here. if( GotDaemonsOff ) { StartDaemons = FALSE; } PublishObituaries = param_boolean_crufty("PUBLISH_OBITUARIES", true) ? TRUE : FALSE; Lines = param_integer("OBITUARY_LOG_LENGTH",20); master_backoff_constant = param_integer( "MASTER_BACKOFF_CONSTANT", 9, 1 ); master_backoff_ceiling = param_integer( "MASTER_BACKOFF_CEILING", 3600,1 ); master_backoff_factor = param_double( "MASTER_BACKOFF_FACTOR", 2.0, 0 ); if( master_backoff_factor <= 0.0 ) { master_backoff_factor = 2.0; } master_recover_time = param_integer( "MASTER_RECOVER_FACTOR", 300, 1 ); update_interval = param_integer( "MASTER_UPDATE_INTERVAL", 5 * MINUTE, 1 ); check_new_exec_interval = param_integer( "MASTER_CHECK_NEW_EXEC_INTERVAL", 5*MINUTE ); new_bin_delay = param_integer( "MASTER_NEW_BINARY_DELAY", 2*MINUTE, 1 ); new_bin_restart_mode = GRACEFUL; char * restart_mode = param("MASTER_NEW_BINARY_RESTART"); if (restart_mode) { #if 1 StopStateT mode = StringToStopState(restart_mode); #else static const struct { const char * text; StopStateT mode; } modes[] = { { "GRACEFUL", GRACEFUL }, { "PEACEFUL", PEACEFUL }, { "NEVER", NONE }, { "NONE", NONE }, { "NO", NONE }, // { "FAST", FAST }, // { "KILL", KILL }, }; StopStateT mode = (StopStateT)-1; // prime with -1 so we can detect bad input. for (int ii = 0; ii < (int)COUNTOF(modes); ++ii) { if (MATCH == strcasecmp(restart_mode, modes[ii].text)) { mode = modes[ii].mode; break; } } #endif if (mode == (StopStateT)-1) { dprintf(D_ALWAYS, "%s is not a valid value for MASTER_NEW_BINARY_RESTART. using GRACEFUL\n", restart_mode); } if (mode >= 0 && mode <= NONE) new_bin_restart_mode = mode; free(restart_mode); } preen_interval = param_integer( "PREEN_INTERVAL", 24*HOUR, 0 ); if(preen_interval == 0) { EXCEPT("PREEN_INTERVAL in the condor configuration is too low (0). Please set it to an integer in the range 1 to %d (default %d). To disable condor_preen entirely, comment out PREEN.", INT_MAX, 24*HOUR); } shutdown_fast_timeout = param_integer( "SHUTDOWN_FAST_TIMEOUT", 5*MINUTE, 1 ); shutdown_graceful_timeout = param_integer( "SHUTDOWN_GRACEFUL_TIMEOUT", 30*MINUTE, 1 ); AllowAdminCommands = param_boolean( "ALLOW_ADMIN_COMMANDS", true ); if( FS_Preen ) { free( FS_Preen ); } FS_Preen = param( "PREEN" ); }
// Note: for this to work correctly, it's vital that the events we generate // in recovery mode exactly match how they were output in "non-recovery" // mode, so we can compare timestamps, and, if the timestamp matches // the last pre-recovery timestamp, the entire event string. void JobstateLog::InitializeRecovery() { debug_printf( DEBUG_DEBUG_2, "JobstateLog::InitializeRecovery()\n" ); if ( !_jobstateLogFile ) { return; } // // Find the timestamp of the last "real" event written to the // jobstate.log file. Any events that we see in recovery mode // that have an earlier timestamp should *not* be re-written // to the jobstate.log file. Any events with later timestamps // should be written. Events with equal timestamps need to be // tested individually. // FILE *infile = safe_fopen_wrapper_follow( _jobstateLogFile, "r" ); if ( !infile ) { // This is a fatal error, because by the time we get here, // we should, at the very least, have written the // DAGMAN_STARTED "event". debug_printf( DEBUG_QUIET, "Could not open jobstate log file %s for reading.\n", _jobstateLogFile ); main_shutdown_graceful(); return; } MyString line; off_t startOfLastTimestamp = 0; while ( true ) { off_t currentOffset = ftell( infile ); if ( !line.readLine( infile ) ) { break; } time_t newTimestamp; MyString nodeName; int seqNum; if ( ParseLine( line, newTimestamp, nodeName, seqNum ) ) { // We don't want to look at "INTERNAL" events here, or we'll // get goofed up by our own DAGMAN_STARTED event, etc. if ( nodeName != INTERNAL_NAME ) { // Note: we don't absolutely rely on the timestamps // being in order -- the > below rather than == is // important in that case. if ( newTimestamp > _lastTimestampWritten ) { startOfLastTimestamp = currentOffset; _lastTimestampWritten = newTimestamp; } } } } debug_printf( DEBUG_DEBUG_2, "_lastTimestampWritten: %lu\n", (unsigned long)_lastTimestampWritten ); // // Now find all lines that match the last timestamp, and put // them into a hash table for future reference. // if ( fseek( infile, startOfLastTimestamp, SEEK_SET ) != 0 ) { debug_printf( DEBUG_QUIET, "Error seeking in jobstate log file %s.\n", _jobstateLogFile ); } while ( line.readLine( infile ) ) { time_t newTimestamp; MyString nodeName; int seqNum; if ( ParseLine( line, newTimestamp, nodeName, seqNum ) ) { if ( (newTimestamp == _lastTimestampWritten) && (nodeName != INTERNAL_NAME) ) { _lastTimestampLines.insert( line ); debug_printf( DEBUG_DEBUG_2, "Appended <%s> to _lastTimestampLines\n", line.Value() ); } } } fclose( infile ); }