コード例 #1
0
//---------------------------------------------------------------------------
void
JobstateLog::Write( const time_t *eventTimeP, const MyString &info )
{
		//
		// Here for "fake" events like JOB_SUCCESS, the event will get
		// the timestamp of the last "real" event from the job; this is
		// so that we can correctly avoid re-writing the "fake" events
		// in recovery mode.
		//
	time_t eventTime;
	if ( eventTimeP != NULL && *eventTimeP != 0 ) {
		eventTime = *eventTimeP;
	} else {
		eventTime = time( NULL );
	}

		// Avoid "re-writing" events in recovery mode:
		// If the event time is *after* _lastTimestampWritten, we
		// write the event.  If the event time is *before*
		// _lastTimestampWritten, we don't write the event.  If
		// the times are equal, we have to do a further test down
		// below.
	if ( eventTime < _lastTimestampWritten ) {
		return;
	}

	MyString outline;
	outline.formatstr( "%lu %s", (unsigned long)eventTime, info.Value() );

		//
		// If this event's time matches the time of the last "real"
		// event in the pre-recovery part of the file, we check whether
		// this line is already in the pre-recovery part of the file,
		// and if it is we don't write it again.
		//
	if ( (eventTime == _lastTimestampWritten) &&
				(_lastTimestampLines.count( outline ) > 0) ) {
		return;
	}

	if ( !_outfile ) {
		_outfile = safe_fopen_wrapper_follow( _jobstateLogFile, "a" );
		if ( !_outfile ) {
       		debug_printf( DEBUG_QUIET,
						"Could not open jobstate log file %s for writing.\n",
						_jobstateLogFile );
			main_shutdown_graceful();
			return;
		}
	}

	fprintf( _outfile, "%s\n", outline.Value() );
}
コード例 #2
0
//---------------------------------------------------------------------------
void
JobstateLog::Flush()
{
		// No jobstate log file name is set, so there is nothing to flush.
	if ( !_jobstateLogFile ) {
		return;
	}

		// Write() opens _outfile lazily, so it can still be null here
		// if nothing has been written yet.  fflush(NULL) would flush
		// *every* open output stream in the process, which is not what
		// this method is for -- just return.
	if ( !_outfile ) {
		return;
	}

		// A failed flush is reported and triggers a graceful shutdown.
	if ( fflush( _outfile ) != 0 ) {
		debug_printf( DEBUG_QUIET,
					"Error flushing output to jobstate log file %s.\n",
					_jobstateLogFile );
		main_shutdown_graceful();
	}
}
コード例 #3
0
//---------------------------------------------------------------------------
// Here we re-read the jobstate.log file to find out what sequence number
// we should start with when running a rescue DAG.
void
JobstateLog::InitializeRescue()
{
	debug_printf( DEBUG_DEBUG_2, "JobstateLog::InitializeRescue()\n" );

	if ( !_jobstateLogFile ) {
		return;
	}

	FILE *infile = safe_fopen_wrapper_follow( _jobstateLogFile, "r" );
	if ( !infile ) {
			// This is a fatal error, because by the time we get here,
			// we should, at the very least, have written the
			// DAGMAN_STARTED "event".
		debug_printf( DEBUG_QUIET,
					"Could not open jobstate log file %s for reading.\n",
					_jobstateLogFile );
		main_shutdown_graceful();
		return;
	}

	int maxSeqNum = 0;
	MyString line;

	while ( line.readLine( infile ) ) {
		time_t newTimestamp;
		MyString nodeName;
		int seqNum;
		if ( ParseLine( line, newTimestamp, nodeName, seqNum ) ) {
			maxSeqNum = MAX( maxSeqNum, seqNum );
		}
	}

	fclose( infile );

	debug_printf( DEBUG_DEBUG_2,
				"Max sequence num in jobstate.log file: %d\n", maxSeqNum );

	Job::SetJobstateNextSequenceNum( maxSeqNum + 1 );
}
コード例 #4
0
ファイル: master.cpp プロジェクト: zhangzhehust/htcondor
/*
 ** Normal-shutdown callback from daemon-core: stop all daemons and exit.
 **
 ** For a peaceful shutdown the children are told first, and the real
 ** (graceful) shutdown is deferred to a timer so they have a chance to
 ** notice the message.  In every other case -- not peaceful, no child
 ** was told, or the timer could not be registered -- the graceful
 ** shutdown starts immediately.
 */
void
main_shutdown_normal()
{
	if (daemonCore->GetPeacefulShutdown()) {
		const int timeout = 5;
		if (daemons.SetPeacefulShutdown(timeout) > 0) {
			// Fire one second after the timeout handed to the children.
			const int tid = daemonCore->Register_Timer(timeout+1, 0,
							(TimerHandler)main_shutdown_graceful,
							"main_shutdown_graceful");
			if (tid != -1) {
				// The timer will perform the shutdown later.
				return;
			}
			dprintf( D_ALWAYS, "ERROR! Can't register DaemonCore timer!\n" );
		}
	}

	// No deferred shutdown is pending, so shut down now.
	main_shutdown_graceful();
}
コード例 #5
0
ファイル: master.cpp プロジェクト: zhangzhehust/htcondor
// Read the master's settings from the configuration (via the param*()
// family) into the corresponding file-level variables.  The static flag
// below implies this can run more than once; presumably it is re-run on
// reconfig -- confirm against the callers.
void
init_params()
{
	char	*tmp;
	static	int	master_name_in_config = 0;

	if( ! master_name_in_config ) {
			// First time, or we know it's not in the config file. 
		if( ! MasterName ) {
				// Not set on command line
			tmp = param( "MASTER_NAME" );
			if( tmp ) {
				MasterName = build_valid_daemon_name( tmp );
				master_name_in_config = 1;
				free( tmp );
			} 
		}
	} else {
			// The name came from the config file on an earlier call, so
			// rebuild it from the current config value.
		delete [] MasterName;
		tmp = param( "MASTER_NAME" );
		MasterName = build_valid_daemon_name( tmp );
		free( tmp );
	}
	if( MasterName ) {
		dprintf( D_FULLDEBUG, "Using name: %s\n", MasterName );
	}

		// START_MASTER=FALSE asks this master to shut down.  Remember the
		// answer: main_shutdown_graceful() may return, and in that case
		// StartDaemons must stay FALSE rather than being reset to TRUE
		// below.
	const bool start_master = param_boolean_crufty("START_MASTER", true);
	if (!start_master) {
			dprintf( D_ALWAYS, "START_MASTER was set to FALSE, shutting down.\n" );
			StartDaemons = FALSE;
			main_shutdown_graceful();
	}

		// Daemons are started by default, unless the master itself was
		// told not to start (above), START_DAEMONS is FALSE, or we were
		// sent the daemons_off command.
	if (start_master) {
		StartDaemons = TRUE;
	}
	if (!param_boolean_crufty("START_DAEMONS", true)) {
			dprintf( D_ALWAYS, 
					 "START_DAEMONS flag was set to FALSE.  Not starting daemons.\n" );
			StartDaemons = FALSE;
	} 
		// If we were sent the daemons_off command, don't forget that
		// here. 
	if( GotDaemonsOff ) {
		StartDaemons = FALSE;
	}

	PublishObituaries = param_boolean_crufty("PUBLISH_OBITUARIES", true) ? TRUE : FALSE;

	Lines = param_integer("OBITUARY_LOG_LENGTH",20);

	master_backoff_constant = param_integer( "MASTER_BACKOFF_CONSTANT", 9, 1 );

	master_backoff_ceiling = param_integer( "MASTER_BACKOFF_CEILING", 3600,1 );

	master_backoff_factor = param_double( "MASTER_BACKOFF_FACTOR", 2.0, 0 );
		// A non-positive factor is replaced by the default.
	if( master_backoff_factor <= 0.0 ) {
		master_backoff_factor = 2.0;
	}

	master_recover_time = param_integer( "MASTER_RECOVER_FACTOR", 300, 1 );

	update_interval = param_integer( "MASTER_UPDATE_INTERVAL", 5 * MINUTE, 1 );

	check_new_exec_interval = param_integer( "MASTER_CHECK_NEW_EXEC_INTERVAL", 5*MINUTE );

	new_bin_delay = param_integer( "MASTER_NEW_BINARY_DELAY", 2*MINUTE, 1 );

		// How to restart when a new binary is noticed.  GRACEFUL is the
		// default, and is kept when the configured value is rejected.
	new_bin_restart_mode = GRACEFUL;
	char * restart_mode = param("MASTER_NEW_BINARY_RESTART");
	if (restart_mode) {
		StopStateT mode = StringToStopState(restart_mode);
			// (StopStateT)-1 is treated as "unrecognized input".
		if (mode == (StopStateT)-1)	{
			dprintf(D_ALWAYS, "%s is not a valid value for MASTER_NEW_BINARY_RESTART. using GRACEFUL\n", restart_mode);
		}
			// Only modes in the range [0, NONE] are accepted here.
		if (mode >= 0 && mode <= NONE)
			new_bin_restart_mode = mode;
		free(restart_mode);
	}

	preen_interval = param_integer( "PREEN_INTERVAL", 24*HOUR, 0 );
	if(preen_interval == 0) {
		EXCEPT("PREEN_INTERVAL in the condor configuration is too low (0).  Please set it to an integer in the range 1 to %d (default %d).  To disable condor_preen entirely, comment out PREEN.", INT_MAX, 24*HOUR);

	}

	shutdown_fast_timeout = param_integer( "SHUTDOWN_FAST_TIMEOUT", 5*MINUTE, 1 );

	shutdown_graceful_timeout = param_integer( "SHUTDOWN_GRACEFUL_TIMEOUT", 30*MINUTE, 1 );

	AllowAdminCommands = param_boolean( "ALLOW_ADMIN_COMMANDS", true );

		// Release the previous PREEN value before storing the new one.
	if( FS_Preen ) {
		free( FS_Preen );
	}
	FS_Preen = param( "PREEN" );
}
コード例 #6
0
// Note: for this to work correctly, it's vital that the events we generate
// in recovery mode exactly match how they were output in "non-recovery"
// mode, so we can compare timestamps, and, if the timestamp matches
// the last pre-recovery timestamp, the entire event string.
void
JobstateLog::InitializeRecovery()
{
	debug_printf( DEBUG_DEBUG_2, "JobstateLog::InitializeRecovery()\n" );

	if ( !_jobstateLogFile ) {
		return;
	}

		//
		// Find the timestamp of the last "real" event written to the
		// jobstate.log file.  Any events that we see in recovery mode
		// that have an earlier timestamp should *not* be re-written
		// to the jobstate.log file.  Any events with later timestamps
		// should be written.  Events with equal timestamps need to be
		// tested individually.
		//

	FILE *infile = safe_fopen_wrapper_follow( _jobstateLogFile, "r" );
	if ( !infile ) {
			// This is a fatal error, because by the time we get here,
			// we should, at the very least, have written the
			// DAGMAN_STARTED "event".
		debug_printf( DEBUG_QUIET,
					"Could not open jobstate log file %s for reading.\n",
					_jobstateLogFile );
		main_shutdown_graceful();
		return;
	}

	MyString line;
	off_t startOfLastTimestamp = 0;

	while ( true ) {
		off_t currentOffset = ftell( infile );
		if ( !line.readLine( infile ) ) {
			break;
		}

		time_t newTimestamp;
		MyString nodeName;
		int seqNum;
		if ( ParseLine( line, newTimestamp, nodeName, seqNum ) ) {
				// We don't want to look at "INTERNAL" events here, or we'll
				// get goofed up by our own DAGMAN_STARTED event, etc.
			if ( nodeName != INTERNAL_NAME ) {
					// Note: we don't absolutely rely on the timestamps
					// being in order -- the > below rather than == is
					// important in that case.
				if ( newTimestamp > _lastTimestampWritten ) {
					startOfLastTimestamp = currentOffset;
					_lastTimestampWritten = newTimestamp;
				}
			}
		}
	}

	debug_printf( DEBUG_DEBUG_2, "_lastTimestampWritten: %lu\n",
				(unsigned long)_lastTimestampWritten );

		//
		// Now find all lines that match the last timestamp, and put
		// them into a hash table for future reference.
		//
	if ( fseek( infile, startOfLastTimestamp, SEEK_SET ) != 0 ) {
		debug_printf( DEBUG_QUIET,
					"Error seeking in jobstate log file %s.\n",
					_jobstateLogFile );
	}

	while ( line.readLine( infile ) ) {
		time_t newTimestamp;
		MyString nodeName;
		int seqNum;
		if ( ParseLine( line, newTimestamp, nodeName, seqNum ) ) {
			if ( (newTimestamp == _lastTimestampWritten) &&
						(nodeName != INTERNAL_NAME) ) {
				_lastTimestampLines.insert( line );
				debug_printf( DEBUG_DEBUG_2,
							"Appended <%s> to _lastTimestampLines\n",
							line.Value() );
			}
		}
	}

	fclose( infile );
}