Exemple #1
0
void
AvailStats::compute( amask_t how_much )
{
	if( !compute_avail_stats ) return;
	if( IS_STATIC(how_much) && IS_SHARED(how_much) ) {

		as_avail_confidence = param_double("STARTD_AVAIL_CONFIDENCE", as_avail_confidence, 0, 1);

		as_max_avail_periods = param_integer("STARTD_MAX_AVAIL_PERIOD_SAMPLES", as_max_avail_periods);
	}

	if( IS_UPDATE(how_much) && IS_SHARED(how_much) ) {
		time_t current_time = time(0);

			// only compute avail time estimate if we're in non-owner state
		if( as_start_avail ) {

				// first, skip over intervals less than our idle time so far
			int current_idle_time = current_time - as_start_avail;
			int num_intervals = as_num_avail_periods;
			as_avail_periods.Rewind();
			int item = 0;
			as_avail_periods.Next(item);
			while( num_intervals && item < current_idle_time ) { 
				as_avail_periods.Next(item);
				num_intervals--;
			}

			if( !num_intervals ) {
					// If this is the longest we've ever been idle, our
					// historical data isn't too useful to us, so give an
					// estimate based on how long we've been idle so far.
				as_avail_estimate =
					(int)floor(current_idle_time*(2.0-as_avail_confidence));
			} else {
					// Otherwise, find the record in our history at the
					// requested confidence level.
				int idx = (int)floor(num_intervals*(1.0-as_avail_confidence));
				while( idx ) {
					as_avail_periods.Next(item);
					idx--;
				}
				as_avail_estimate = item;
			}
			as_avail_time =
				float(as_tot_avail_time + current_idle_time) /
				float(current_time - as_birthdate);
		} else {
			as_avail_time =	float(as_tot_avail_time) /
							float(current_time - as_birthdate);
		}
	}
}
Exemple #2
0
void StarterStatistics::Reconfig() {
    int quantum = param_integer("STATISTICS_WINDOW_QUANTUM_STARTER", INT_MAX, 1, INT_MAX);
    if (quantum >= INT_MAX)
        quantum = param_integer("STATISTICS_WINDOW_QUANTUM", 4*60, 1, INT_MAX);
    this->RecentWindowQuantum = quantum;

    int window = param_integer("STATISTICS_WINDOW_SECONDS_STARTER", INT_MAX, 1, INT_MAX);
    if (window >= INT_MAX)
        window = param_integer("STATISTICS_WINDOW_SECONDS", 1200, quantum, INT_MAX);
    this->RecentWindowMax = window;

    this->RecentWindowMax = window;
    Pool.SetRecentMax(window, this->RecentWindowQuantum);

    this->PublishFlags = IF_BASICPUB | IF_RECENTPUB;
    char* tmp = param("STATISTICS_TO_PUBLISH");
    if (tmp) {
       this->PublishFlags = generic_stats_ParseConfigString(tmp, "STARTER", "_no_alternate_name_", this->PublishFlags);
       free(tmp);
    }
}
Exemple #3
0
void
main_init(int argc, char *argv[])
{
	_EXCEPT_Cleanup = ExceptCleanup;

		/* Start up with condor.condor privileges. */
	set_condor_priv();

		// Register a do-nothing reaper.  This is just because the
		// file transfer object, which could be instantiated later,
		// registers its own reaper and does an EXCEPT if it gets
		// a reaper ID of 1 (since lots of other daemons have a reaper
		// ID of 1 hard-coded as special... this is bad).
	daemonCore->Register_Reaper("dummy_reaper",
							(ReaperHandler)&dummy_reaper,
							"dummy_reaper",NULL);


		// register SIGUSR1 (condor_rm) for shutdown...
	daemonCore->Register_Signal( SIGUSR1, "SIGUSR1", 
		(SignalHandler)&handleSignals,"handleSignals");
		// register UPDATE_JOBAD for qedit changes
	daemonCore->Register_Signal( UPDATE_JOBAD, "UPDATE_JOBAD", 
		(SignalHandler)&handleSignals,"handleSignals");
		// handle daemoncore signals which are passed down
	daemonCore->Register_Signal( DC_SIGSUSPEND, "DC_SIGSUSPEND", 
		(SignalHandler)&handleSignals,"handleSignals");
	daemonCore->Register_Signal( DC_SIGCONTINUE, "DC_SIGCONTINUE", 
		(SignalHandler)&handleSignals,"handleSignals");

	int shadow_worklife = param_integer( "SHADOW_WORKLIFE", 3600 );
	if( shadow_worklife > 0 ) {
		shadow_worklife_expires = time(NULL) + shadow_worklife;
	}
	else if( shadow_worklife == 0 ) {
			// run one job and then exit
		shadow_worklife_expires = time(NULL)-1;
	}
	else {
		shadow_worklife_expires = 0;
	}

	parseArgs( argc, argv );

	CheckSpoolVersion(SPOOL_MIN_VERSION_SHADOW_SUPPORTS,SPOOL_CUR_VERSION_SHADOW_SUPPORTS);

	ClassAd* ad = readJobAd();
	if( ! ad ) {
		EXCEPT( "Failed to read job ad!" );
	}

	startShadow( ad );
}
Exemple #4
0
// Initialize to read the global event log
bool
ReadUserLog::initialize( void )
{
	char	*path = param( "EVENT_LOG" );
	if ( NULL == path ) {
		Error( LOG_ERROR_FILE_NOT_FOUND,__LINE__ );
		return false;
	}
	int max_rotations = param_integer( "EVENT_LOG_MAX_ROTATIONS", 1, 0 );
	bool status = initialize( path, max_rotations, true );
	free( path );
	return status;
}
Exemple #5
0
void BaseShadow::config()
{
	if (spool) free(spool);
	spool = param("SPOOL");
	if (!spool) {
		EXCEPT("SPOOL not specified in config file.");
	}

	if (fsDomain) free(fsDomain);
	fsDomain = param( "FILESYSTEM_DOMAIN" );
	if (!fsDomain) {
		EXCEPT("FILESYSTEM_DOMAIN not specified in config file.");
	}

	if (uidDomain) free(uidDomain);
	uidDomain = param( "UID_DOMAIN" );
	if (!uidDomain) {
		EXCEPT("UID_DOMAIN not specified in config file.");
	}

	reconnect_ceiling = param_integer( "RECONNECT_BACKOFF_CEILING", 300 );

	reconnect_e_factor = 0.0;
	reconnect_e_factor = param_double( "RECONNECT_BACKOFF_FACTOR", 2.0, 0.0 );
	if( reconnect_e_factor < -1e-4 || reconnect_e_factor > 1e-4) {
    	reconnect_e_factor = 2.0;
    }

	m_cleanup_retry_tid = -1;
	m_num_cleanup_retries = 0;
		// NOTE: these config knobs are very similar to
		// LOCAL_UNIVERSE_MAX_JOB_CLEANUP_RETRIES and
		// LOCAL_UNIVERSE_JOB_CLEANUP_RETRY_DELAY in the local starter.
	m_max_cleanup_retries = param_integer("SHADOW_MAX_JOB_CLEANUP_RETRIES", 5);
	m_cleanup_retry_delay = param_integer("SHADOW_JOB_CLEANUP_RETRY_DELAY", 30);

	m_lazy_queue_update = param_boolean("SHADOW_LAZY_QUEUE_UPDATE", true);
}
void
TransferQueueManager::parseThrottleConfig(char const *config_param,bool &enable_throttle,double &low,double &high,std::string &throttle_short_horizon,std::string &throttle_long_horizon,time_t &throttle_increment_wait)
{
	enable_throttle = false;

	std::string throttle_config;
	if( !param(throttle_config,config_param) ) {
		return;
	}

	char *endptr=NULL;
	low = strtod(throttle_config.c_str(),&endptr);
	if( !endptr || !(*endptr == ' ' || *endptr == '\0') ) {
		EXCEPT("Invalid configuration for %s: %s\n",config_param,throttle_config.c_str());
		return;
	}

	while( *endptr == ' ' ) endptr++;

	if( *endptr == '\0' ) {
		high = low;
		low = 0.9*high;
	}
	else if( strncmp(endptr,"to ",3)==0 ) {
		endptr += 3;
		while( *endptr == ' ' ) endptr++;

		high = strtod(endptr,&endptr);
		if( !endptr || *endptr != '\0' ) {
			dprintf(D_ALWAYS,"Invalid configuration for %s: %s\n",config_param,throttle_config.c_str());
			return;
		}
	}
	else {
		EXCEPT("Invalid configuration for %s: %s\n",config_param,throttle_config.c_str());
	}

		// for now, these are hard-coded
	std::string horizon_param;
	formatstr(horizon_param,"%s_SHORT_HORIZON",config_param);
	param(throttle_short_horizon,horizon_param.c_str(),"1m");
	formatstr(horizon_param,"%s_LONG_HORIZON",config_param);
	param(throttle_long_horizon,horizon_param.c_str(),"5m");

	std::string wait_param;
	formatstr(wait_param,"%s_WAIT_BETWEEN_INCREMENTS",config_param);
	throttle_increment_wait = (time_t) param_integer(wait_param.c_str(),60,0);

	m_throttle_disk_load = true;
}
Exemple #7
0
bool SelfMonitorData::ExportData(ClassAd *ad)
{
    bool      success;
    MyString  attribute;

    if (ad == NULL) {
        success = false;
    } else {
        ad->Assign("MonitorSelfTime",            last_sample_time);
        ad->Assign("MonitorSelfCPUUsage",        cpu_usage);
        ad->Assign("MonitorSelfImageSize",       image_size);
        ad->Assign("MonitorSelfResidentSetSize", rs_size);
        ad->Assign("MonitorSelfAge",             age);
        ad->Assign("MonitorSelfRegisteredSocketCount", registered_socket_count);
        ad->Assign("MonitorSelfSecuritySessions", cached_security_sessions);
        ad->Assign(ATTR_DETECTED_CPUS, param_integer("DETECTED_CORES", 0));
        ad->Assign(ATTR_DETECTED_MEMORY, param_integer("DETECTED_MEMORY", 0));

        success = true;
    }

    return success;
}
void
HibernationManager::update( void )
{
	int previous_inteval = m_interval;
	m_interval = param_integer ( "HIBERNATE_CHECK_INTERVAL",
		0 /* default */, 0 /* min; no max */ );	
	bool change = ( previous_inteval != m_interval );
	if ( change ) {
		dprintf ( D_ALWAYS, "HibernationManager: Hibernation is %s\n",
			( m_interval > 0 ? "enabled" : "disabled" ) );
	}
	if ( m_hibernator ) {
		m_hibernator->update( );
	}
}
Exemple #9
0
void
MachAttributes::credd_test()
{
	// Attempt to perform a NOP on our CREDD_HOST. This will test
	// our ability to authenticate with DAEMON-level auth, and thus
	// fetch passwords. If we succeed, we'll advertise the CREDD_HOST

	char *credd_host = param("CREDD_HOST");

	if (credd_host == NULL) {
		if (m_local_credd != NULL) {
			free(m_local_credd);
			m_local_credd = NULL;
		}
		return;
	}

	if (m_local_credd != NULL) {
		if (strcmp(m_local_credd, credd_host) == 0) {
			free(credd_host);
			return;
		}
		else {
			free(m_local_credd);
			m_local_credd = NULL;
			m_last_credd_test = 0;
		}
	}

	time_t now = time(NULL);
	double thresh = (double)param_integer("CREDD_TEST_INTERVAL", 300);
	if (difftime(now, m_last_credd_test) > thresh) {
		Daemon credd(DT_CREDD);
		if (credd.locate()) {
			Sock *sock = credd.startCommand(CREDD_NOP, Stream::reli_sock, 20);
			if (sock != NULL) {
				sock->decode();
				if (sock->end_of_message()) {
					m_local_credd = credd_host;
				}
			}
		}
		m_last_credd_test = now;
	}
	if (credd_host != m_local_credd) {
		free(credd_host);
	}
}
Exemple #10
0
void NordugridJobReconfig()
{
	int tmp_int;

	tmp_int = param_integer( "GRIDMANAGER_GAHP_CALL_TIMEOUT", 5 * 60 );
	NordugridJob::setGahpCallTimeout( tmp_int );

	// Tell all the resource objects to deal with their new config values
	NordugridResource *next_resource;

	NordugridResource::ResourcesByName.startIterations();

	while ( NordugridResource::ResourcesByName.iterate( next_resource ) != 0 ) {
		next_resource->Reconfig();
	}
}
Exemple #11
0
void ScheddStatistics::Reconfig()
{
    this->RecentWindowMax = param_integer("STATISTICS_WINDOW_SECONDS", 1200, 
                                          schedd_stats_window_quantum, INT_MAX);
    this->PublishFlags    = IF_BASICPUB | IF_RECENTPUB;
    char * tmp = param("STATISTICS_TO_PUBLISH");
    if (tmp) {
       this->PublishFlags = generic_stats_ParseConfigString(tmp, "SCHEDD", "SCHEDULER", this->PublishFlags);
       free(tmp);
    }
    SetWindowSize(this->RecentWindowMax);

    //stats_histogram_sizes::init_sizes_from_param("MAX_HIST_SIZES_LEVELS");
    //JobSizes.reconfig();
    //JobSizesGoodput.reconfig();
    //JobSizesBadput.reconfig();
}
HibernatorBase::SLEEP_STATE 
UserDefinedToolsHibernator::enterState ( HibernatorBase::SLEEP_STATE state ) const
{
	
	/** Make sure a tool for this sleep state has been defined */
	unsigned index = sleepStateToInt ( state );

	if ( NULL == m_tool_paths[index] ) {
		dprintf ( 
			D_FULLDEBUG, 
			"Hibernator::%s tool not configured.\n",
			HibernatorBase::sleepStateToString ( state ) );
		return HibernatorBase::NONE;
	}

	/** Tell DaemonCore to register the process family so we can
		safely kill everything from the reaper */
	FamilyInfo fi;
	fi.max_snapshot_interval = param_integer (
		"PID_SNAPSHOT_INTERVAL", 
		15 );

	/** Run the user tool */
	int pid = daemonCore->Create_Process (
		m_tool_paths[index], 
		m_tool_args[index], 
		PRIV_CONDOR_FINAL,
		m_reaper_id, 
		FALSE, 
		FALSE, 
		NULL, 
		NULL, 
		&fi );	

	if ( FALSE == pid ) {
		dprintf ( 
			D_ALWAYS, 
			"UserDefinedToolsHibernator::enterState: Create_Process() "
			"failed\n" );
		return HibernatorBase::NONE;
	}

	return state;

}
Exemple #13
0
void BaseResource::UnregisterJob( BaseJob *job )
{
	CancelSubmit( job );

	pingRequesters.Delete( job );
	registeredJobs.Delete( job );
	leaseUpdates.Delete( job );

	if ( IsEmpty() ) {
		int delay = param_integer( "GRIDMANAGER_EMPTY_RESOURCE_DELAY", 5*60 );
		if ( delay < 0 ) {
			delay = 0;
		}
		deleteMeTid = daemonCore->Register_Timer( delay,
								(TimerHandlercpp)&BaseResource::DeleteMe,
								"BaseResource::DeleteMe", (Service*)this );
	}
}
Exemple #14
0
BaseResource::BaseResource( const char *resource_name )
{
	resourceName = strdup( resource_name );
	deleteMeTid = TIMER_UNSET;

	resourceDown = false;
	firstPingDone = false;
	pingActive = false;
	pingTimerId = daemonCore->Register_Timer( 0,
								(TimerHandlercpp)&BaseResource::Ping,
								"BaseResource::Ping", (Service*)this );
	lastPing = 0;
	lastStatusChange = 0;

	jobLimit = DEFAULT_MAX_SUBMITTED_JOBS_PER_RESOURCE;

	hasLeases = false;
	updateLeasesTimerId = daemonCore->Register_Timer( 0,
								(TimerHandlercpp)&BaseResource::UpdateLeases,
								"BaseResource::UpdateLeases", (Service*)this );
	lastUpdateLeases = 0;
	updateLeasesActive = false;
	updateLeasesCmdActive = false;
	m_hasSharedLeases = false;
	m_defaultLeaseDuration = -1;
	m_sharedLeaseExpiration = 0;

	_updateCollectorTimerId = daemonCore->Register_Timer ( 
		0,
		(TimerHandlercpp)&BaseResource::UpdateCollector,
		"BaseResource::UpdateCollector",
		(Service*)this );
	_lastCollectorUpdate = 0;
	_firstCollectorUpdate = true;
	_collectorUpdateInterval = param_integer ( 
		"GRIDMANAGER_COLLECTOR_UPDATE_INTERVAL", 5*60 );

	m_batchStatusActive = false;
	m_batchPollTid = TIMER_UNSET;

	m_paramJobPollRate = -1;
	m_paramJobPollInterval = -1;
	m_jobPollInterval = 0;
}
Exemple #15
0
int countTypes( int max_types, int num_cpus, int** array_ptr, bool except )
{
	int i, num=0, num_set=0;
    MyString param_name;
    MyString cruft_name;
	int* my_type_nums = new int[max_types];

	if( ! array_ptr ) {
		EXCEPT( "ResMgr:countTypes() called with NULL array_ptr!" );
	}

		// Type 0 is special, user's shouldn't define it.
	_checkInvalidParam("NUM_SLOTS_TYPE_0", except);
		// CRUFT
	_checkInvalidParam("NUM_VIRTUAL_MACHINES_TYPE_0", except);

	for( i=1; i<max_types; i++ ) {
		param_name.formatstr("NUM_SLOTS_TYPE_%d", i);
		if (param_boolean("ALLOW_VM_CRUFT", false)) {
			cruft_name.formatstr("NUM_VIRTUAL_MACHINES_TYPE_%d", i);
			my_type_nums[i] = param_integer(param_name.Value(),
											 param_integer(cruft_name.Value(),
														   0));
		} else {
			my_type_nums[i] = param_integer(param_name.Value(), 0);
		}
		if (my_type_nums[i]) {
			num_set = 1;
			num += my_type_nums[i];
		}
	}

	if( num_set ) {
			// We found type-specific stuff, use that.
		my_type_nums[0] = 0;
	} else {
			// We haven't found any special types yet.  Therefore,
			// we're evenly dividing things, so we only have to figure
			// out how many nodes to advertise.
		if (param_boolean("ALLOW_VM_CRUFT", false)) {
			my_type_nums[0] = param_integer("NUM_SLOTS",
										  param_integer("NUM_VIRTUAL_MACHINES",
														num_cpus));
		} else {
			my_type_nums[0] = param_integer("NUM_SLOTS", num_cpus);
		}
		num = my_type_nums[0];
	}
	*array_ptr = my_type_nums;
	return num;
}
Exemple #16
0
void
CCBListener::InitAndReconfig()
{
	int new_heartbeat_interval = param_integer("CCB_HEARTBEAT_INTERVAL",1200,0);
	if( new_heartbeat_interval != m_heartbeat_interval ) {
		if( new_heartbeat_interval < 30 && new_heartbeat_interval > 0 ) {
			new_heartbeat_interval = 30;
				// CCB server doesn't expect a high rate of unsolicited
				// input from us
			dprintf(D_ALWAYS,
					"CCBListener: using minimum heartbeat interval of %ds\n",
					new_heartbeat_interval);
		}
		m_heartbeat_interval = new_heartbeat_interval;
		if( m_heartbeat_initialized ) {
			RescheduleHeartbeat();
		}
	}
}
Exemple #17
0
void GlobusResource::Reconfig()
{
	int tmp_int;

	BaseResource::Reconfig();

	gahp->setTimeout( gahpCallTimeout );

	tmp_int = param_integer( "GRIDMANAGER_MAX_JOBMANAGERS_PER_RESOURCE",
							 DEFAULT_MAX_JOBMANAGERS_PER_RESOURCE );
	if ( tmp_int == 0 ) {
		submitJMLimit = GM_RESOURCE_UNLIMITED;
		restartJMLimit = GM_RESOURCE_UNLIMITED;
	} else {
		if ( tmp_int < 2 ) {
			tmp_int = 2;
		}
		submitJMLimit = tmp_int / 2;
		restartJMLimit = tmp_int - submitJMLimit;
	}

	// If the jobmanager limits were widened, move jobs from Wanted to
	// Allowed and signal them
	while ( ( submitJMsAllowed.Length() + restartJMsAllowed.Length() <
			  submitJMLimit + restartJMLimit ) &&
			( submitJMsWanted.Length() != 0 ||
			  restartJMsWanted.Length() != 0 ) ) {
		JMComplete( NULL );
	}

	if ( enableGridMonitor ) {
		// start grid monitor
		daemonCore->Reset_Timer( checkMonitorTid, 0 );
	} else {
		// stop grid monitor
		if ( monitorActive || monitorStarting ) {
			StopMonitor();
		}

		daemonCore->Reset_Timer( checkMonitorTid, TIMER_NEVER );
	}
}
Exemple #18
0
Timeslice &DCCollector::getBlacklistTimeslice()
{
	std::map< std::string, Timeslice >::iterator itr;
	itr = blacklist.find(addr());
	if( itr == blacklist.end() ) {
		Timeslice ts;
		
			// Blacklist this collector if last failed contact took more
			// than 1% of the time that has passed since that operation
			// started.  (i.e. if contact fails quickly, don't worry, but
			// if it takes a long time to fail, be cautious.
		ts.setTimeslice(0.01);
			// Set an upper bound of one hour for the collector to be blacklisted.
		int avoid_time = param_integer("DEAD_COLLECTOR_MAX_AVOIDANCE_TIME",3600);
		ts.setMaxInterval(avoid_time);
		ts.setInitialInterval(0);

		itr = blacklist.insert( std::map< std::string, Timeslice >::value_type(addr(),ts) ).first;
	}
	return itr->second;
}
int CollectorEngine::
scheduleHousekeeper (int timeout)
{
	// Are we filtering updates that we forward to the view collector?
	std::string watch_list;
	param(watch_list,"COLLECTOR_FORWARD_WATCH_LIST", "State,Cpus,Memory,IdleJobs,ClaimId,Capability,ClaimIdList,ChildClaimIds");
	m_forwardWatchList.clearAll();
	m_forwardWatchList.initializeFromString(watch_list.c_str());

	m_forwardFilteringEnabled = param_boolean( "COLLECTOR_FORWARD_FILTERING", false );

	// cancel outstanding housekeeping requests
	if (housekeeperTimerID != -1)
	{
		(void) daemonCore->Cancel_Timer(housekeeperTimerID);
	}

	// reset for new timer
	if (timeout < 0)
		return 0;

	// set to new timeout interval
	machineUpdateInterval = timeout;

	m_forwardInterval = param_integer("COLLECTOR_FORWARD_INTERVAL", machineUpdateInterval / 3, 0);

	// if timeout interval was non-zero (i.e., housekeeping required) ...
	if (timeout > 0)
	{
		// schedule housekeeper
		housekeeperTimerID = daemonCore->Register_Timer(machineUpdateInterval,
						machineUpdateInterval,
						(TimerHandlercpp)&CollectorEngine::housekeeper,
						"CollectorEngine::housekeeper",this);
		if (housekeeperTimerID == -1)
			return 0;
	}

	return 1;
}
Exemple #20
0
void BaseJob::BaseJobReconfig()
{
    int tmp_int;

    if ( periodicPolicyEvalTid != TIMER_UNSET ) {
        daemonCore->Cancel_Timer( periodicPolicyEvalTid );
        periodicPolicyEvalTid = TIMER_UNSET;
    }

    tmp_int = param_integer( "PERIODIC_EXPR_INTERVAL", 300 );
    if ( tmp_int > 0 ) {
        periodicPolicyEvalTid = daemonCore->Register_Timer( tmp_int, tmp_int,
                                BaseJob::EvalAllPeriodicJobExprs,
                                "EvalAllPeriodicJobExprs" );
    }

    if ( m_checkRemoteStatusTid == TIMER_UNSET ) {
        m_checkRemoteStatusTid = daemonCore->Register_Timer( 5, 60,
                                 BaseJob::CheckAllRemoteStatus,
                                 "BaseJob::CheckAllRemoteStatus" );
    }
}
Exemple #21
0
void 
VMUniverseMgr::docheckVMUniverse(void)
{
	char *vm_type = param( "VM_TYPE" );
	dprintf( D_ALWAYS, "VM universe will be tested "
			"to check if it is available\n");
	if( init() == false && vm_type ) {
		// VM universe is desired, but not available

		// In VMware, some errors may be transient.
		// For example, when VMware fails to start a new VM 
		// due to an incorrect config file, we are unable to 
		// run 'vmrun' command for a while.
		// But after some time, we are able to run it again.
		// So we register a timer to call this function later.
		m_check_interval = param_integer("VM_RECHECK_INTERVAL", 600);
		if ( m_check_tid >= 0 ) {
			daemonCore->Reset_Timer( m_check_tid, m_check_interval );
		} else {
			m_check_tid = daemonCore->Register_Timer(m_check_interval,
				(TimerHandlercpp)&VMUniverseMgr::docheckVMUniverse,
				"VMUniverseMgr::docheckVMUniverse", this);
		}
		dprintf( D_ALWAYS, "Started a timer to test VM universe after "
			   "%d(seconds)\n", m_check_interval);
	} else {
		if ( m_check_tid >= 0 ) {
			daemonCore->Cancel_Timer( m_check_tid );
			m_check_tid = -1;

			// in the case where we had to use the timer,
			// make certain we publish our changes.  
			if( resmgr ) {
				resmgr->eval_and_update_all();
			}	
		}
	}
	free( vm_type );
}
Exemple #22
0
void
CCBListener::Disconnected()
{
	if( m_sock ) {
		daemonCore->Cancel_Socket( m_sock );
		delete m_sock;
		m_sock = NULL;
	}

	if( m_waiting_for_connect ) {
		m_waiting_for_connect = false;
		decRefCount();
	}

	m_waiting_for_registration = false;
	m_registered = false;

	StopHeartbeat();

	if( m_reconnect_timer != -1 ) {
		return; // already in progress
	}

	int reconnect_time = param_integer("CCB_RECONNECT_TIME",60);

	dprintf(D_ALWAYS,
			"CCBListener: connection to CCB server %s failed; "
			"will try to reconnect in %d seconds.\n",
			m_ccb_address.Value(), reconnect_time);

	m_reconnect_timer = daemonCore->Register_Timer(
		reconnect_time,
		(TimerHandlercpp)&CCBListener::ReconnectTime,
		"CCBListener::ReconnectTime",
		this );

	ASSERT( m_reconnect_timer != -1 );
}
void
Reconfig()
{
	contact_schedd_interval = 
		param_integer ("C_GAHP_CONTACT_SCHEDD_DELAY", 5);

		// When GSI authentication is used, we're willing to trust schedds
		// which have the same credential as the job
	if ( proxySubjectName ) {
		char *daemon_subjects = param( "GSI_DAEMON_NAME" );
		if ( daemon_subjects ) {
			std::string buff;
			formatstr( buff, "%s,%s", daemon_subjects, proxySubjectName );
			dprintf( D_ALWAYS, "Setting %s=%s\n", "GSI_DAEMON_NAME",
					 buff.c_str() );
				// We must use our daemon subsystem prefix in case the
				// admin used it in the config file.
			config_insert( "C_GAHP_WORKER_THREAD.GSI_DAEMON_NAME",
						   buff.c_str() );
			free( daemon_subjects );
		}
	}
}
ResourceRequestList::ResourceRequestList(int protocol_version)
	: m_send_end_negotiate(false),
	m_send_end_negotiate_now(false),
	m_requests_to_fetch(0)
{
	m_protocol_version = protocol_version;
	m_clear_rejected_autoclusters = false;
	m_use_resource_request_counts = param_boolean("USE_RESOURCE_REQUEST_COUNTS",true);
	if ( protocol_version == 0 || m_use_resource_request_counts == false ) {
		// Protocol version is 0, and schedd resource request lists were introduced
		// in protocol version 1.  so set m_num_to_fetch to 1 so we use the old
		// protocol of getting one request at a time with this old schedd.
		// Also we must use the old protocol if admin disabled USE_RESOURCE_REQUEST_COUNTS,
		// since the new protocol relies on that.
		m_num_to_fetch = 1;
	} else {
		m_num_to_fetch = param_integer("NEGOTIATOR_RESOURCE_REQUEST_LIST_SIZE");
	}
	errcode = 0;
	current_autocluster = -1;
	resource_request_count = 0;
	resource_request_offers = 0;
}
Exemple #25
0
void
BaseShadow::checkSwap( void )
{
	int	reserved_swap, free_swap;
		// Reserved swap is specified in megabytes
	reserved_swap = param_integer( "RESERVED_SWAP", 0 );
	reserved_swap *= 1024;

	if( reserved_swap == 0 ) {
			// We're not supposed to care about swap space at all, so
			// none of the rest of the checks matter at all.
		return;
	}

	free_swap = sysapi_swap_space();

	dprintf( D_FULLDEBUG, "*** Reserved Swap = %d\n", reserved_swap );
	dprintf( D_FULLDEBUG, "*** Free Swap = %d\n", free_swap );

	if( free_swap < reserved_swap ) {
		dprintf( D_ALWAYS, "Not enough reserved swap space\n" );
		DC_Exit( JOB_NO_MEM );
	}
}	
Exemple #26
0
void Defrag::config()
{

	ASSERT( param(m_state_file,"DEFRAG_STATE_FILE") );
	if( m_last_poll==0 ) {
		loadState();
	}

	int old_polling_interval = m_polling_interval;
	m_polling_interval = param_integer("DEFRAG_INTERVAL",600);
	if( m_polling_interval <= 0 ) {
		dprintf(D_ALWAYS,
				"DEFRAG_INTERVAL=%d, so no pool defragmentation "
				"will be done.\n", m_polling_interval);
		if( m_polling_timer != -1 ) {
			daemonCore->Cancel_Timer(m_polling_timer);
			m_polling_timer = -1;
		}
	}
	else if( m_polling_timer >= 0 ) {
		if( old_polling_interval != m_polling_interval ) {
			daemonCore->Reset_Timer_Period(
				m_polling_timer,
				m_polling_interval);
		}
	}
	else {
		time_t now = time(NULL);
		int first_time = 0;
		if( m_last_poll != 0 && now-m_last_poll < m_polling_interval && m_last_poll <= now ) {
			first_time = m_polling_interval - (now-m_last_poll);
		}
		m_polling_timer = daemonCore->Register_Timer(
			first_time,
			m_polling_interval,
			(TimerHandlercpp)&Defrag::poll,
			"Defrag::poll",
			this );
	}
	if( old_polling_interval != m_polling_interval && m_polling_interval > 0 )
	{
		dprintf(D_ALWAYS,
				"Will evaluate defragmentation policy every DEFRAG_INTERVAL="
				"%d seconds.\n", m_polling_interval);
	}

	m_draining_per_hour = param_double("DEFRAG_DRAINING_MACHINES_PER_HOUR",0,0);

	double rate = m_draining_per_hour/3600.0*m_polling_interval;
	m_draining_per_poll = (int)floor(rate + 0.00001);
	if( m_draining_per_poll < 0 ) m_draining_per_poll = 0;

	double error_per_hour = (rate - m_draining_per_poll)/m_polling_interval*3600.0;
	m_draining_per_poll_hour = (int)floor(error_per_hour + 0.00001);
	if( m_draining_per_hour < 0 || m_polling_interval > 3600 ) {
		m_draining_per_hour = 0;
	}

	double error_per_day = (error_per_hour - m_draining_per_poll_hour)*24.0;
	m_draining_per_poll_day = (int)floor(error_per_day + 0.5);
	if( m_draining_per_poll_day < 0 || m_polling_interval > 3600*24 ) {
		m_draining_per_poll_day = 0;
	}
	dprintf(D_ALWAYS,"polling interval %ds, DEFRAG_DRAINING_MACHINES_PER_HOUR = %f/hour = %d/interval + %d/hour + %d/day\n",
			m_polling_interval,m_draining_per_hour,m_draining_per_poll,
			m_draining_per_poll_hour,m_draining_per_poll_day);

	m_max_draining = param_integer("DEFRAG_MAX_CONCURRENT_DRAINING",-1,-1);
	m_max_whole_machines = param_integer("DEFRAG_MAX_WHOLE_MACHINES",-1,-1);

	ASSERT( param(m_defrag_requirements,"DEFRAG_REQUIREMENTS") );
	validateExpr( m_defrag_requirements.c_str(), "DEFRAG_REQUIREMENTS" );

	ASSERT( param(m_whole_machine_expr,"DEFRAG_WHOLE_MACHINE_EXPR") );
	validateExpr( m_whole_machine_expr.c_str(), "DEFRAG_WHOLE_MACHINE_EXPR" );

	ASSERT( param(m_draining_schedule_str,"DEFRAG_DRAINING_SCHEDULE") );
	if( m_draining_schedule_str.empty() ) {
		m_draining_schedule = DRAIN_GRACEFUL;
		m_draining_schedule_str = "graceful";
	}
	else {
		m_draining_schedule = getDrainingScheduleNum(m_draining_schedule_str.c_str());
		if( m_draining_schedule < 0 ) {
			EXCEPT("Invalid draining schedule: %s\n",m_draining_schedule_str.c_str());
		}
	}

	MyString rank;
	param(rank,"DEFRAG_RANK");
	if( rank.IsEmpty() ) {
		m_rank_ad.Delete(ATTR_RANK);
	}
	else {
		if( !m_rank_ad.AssignExpr(ATTR_RANK,rank.Value()) ) {
			EXCEPT("Invalid expression for DEFRAG_RANK: %s\n",
				   rank.Value());
		}
	}

	int update_interval = param_integer("DEFRAG_UPDATE_INTERVAL", 600);
	if(m_public_ad_update_interval != update_interval) {
		m_public_ad_update_interval = update_interval;

		dprintf(D_FULLDEBUG, "Setting update interval to %d\n",
			m_public_ad_update_interval);

		if(m_public_ad_update_timer >= 0) {
			daemonCore->Reset_Timer_Period(
				m_public_ad_update_timer,
				m_public_ad_update_interval);
		}
		else {
			m_public_ad_update_timer = daemonCore->Register_Timer(
				0,
				m_public_ad_update_interval,
				(TimerHandlercpp)&Defrag::updateCollector,
				"Defrag::updateCollector",
				this);
		}
	}

	if (param(m_cancel_requirements, "DEFRAG_CANCEL_REQUIREMENTS")) {
		validateExpr( m_cancel_requirements.c_str(), "DEFRAG_CANCEL_REQUIREMENTS" );
	} else {
		m_cancel_requirements = "";
	}

	param(m_defrag_name,"DEFRAG_NAME");

	int stats_quantum = m_polling_interval;
	int stats_window = 10*stats_quantum;
	m_stats.SetWindowSize(stats_window,stats_quantum);
}
Exemple #27
0
void
init_params()
{
	char	*tmp;
	static	int	master_name_in_config = 0;

	if( ! master_name_in_config ) {
			// First time, or we know it's not in the config file. 
		if( ! MasterName ) {
				// Not set on command line
			tmp = param( "MASTER_NAME" );
			if( tmp ) {
				MasterName = build_valid_daemon_name( tmp );
				master_name_in_config = 1;
				free( tmp );
			} 
		}
	} else {
		delete [] MasterName;
		tmp = param( "MASTER_NAME" );
		MasterName = build_valid_daemon_name( tmp );
		free( tmp );
	}
	if( MasterName ) {
		dprintf( D_FULLDEBUG, "Using name: %s\n", MasterName );
	}
			
	if (!param_boolean_crufty("START_MASTER", true)) {
			dprintf( D_ALWAYS, "START_MASTER was set to FALSE, shutting down.\n" );
			StartDaemons = FALSE;
			main_shutdown_graceful();
	}

		
	StartDaemons = TRUE;
	if (!param_boolean_crufty("START_DAEMONS", true)) {
			dprintf( D_ALWAYS, 
					 "START_DAEMONS flag was set to FALSE.  Not starting daemons.\n" );
			StartDaemons = FALSE;
	} 
		// If we were sent the daemons_off command, don't forget that
		// here. 
	if( GotDaemonsOff ) {
		StartDaemons = FALSE;
	}

	PublishObituaries = param_boolean_crufty("PUBLISH_OBITUARIES", true) ? TRUE : FALSE;

	Lines = param_integer("OBITUARY_LOG_LENGTH",20);

	master_backoff_constant = param_integer( "MASTER_BACKOFF_CONSTANT", 9, 1 );

	master_backoff_ceiling = param_integer( "MASTER_BACKOFF_CEILING", 3600,1 );

	master_backoff_factor = param_double( "MASTER_BACKOFF_FACTOR", 2.0, 0 );
	if( master_backoff_factor <= 0.0 ) {
    	master_backoff_factor = 2.0;
    }
	
	master_recover_time = param_integer( "MASTER_RECOVER_FACTOR", 300, 1 );

	update_interval = param_integer( "MASTER_UPDATE_INTERVAL", 5 * MINUTE, 1 );

	check_new_exec_interval = param_integer( "MASTER_CHECK_NEW_EXEC_INTERVAL", 5*MINUTE );

	new_bin_delay = param_integer( "MASTER_NEW_BINARY_DELAY", 2*MINUTE, 1 );

	new_bin_restart_mode = GRACEFUL;
	char * restart_mode = param("MASTER_NEW_BINARY_RESTART");
	if (restart_mode) {
#if 1
		StopStateT mode = StringToStopState(restart_mode);
#else
		static const struct {
			const char * text;
			StopStateT   mode;
			} modes[] = {
				{ "GRACEFUL", GRACEFUL },
				{ "PEACEFUL", PEACEFUL },
				{ "NEVER", NONE }, { "NONE", NONE }, { "NO", NONE },
			//	{ "FAST", FAST },
			//	{ "KILL", KILL },
			};
		StopStateT mode = (StopStateT)-1; // prime with -1 so we can detect bad input.
		for (int ii = 0; ii < (int)COUNTOF(modes); ++ii) {
			if (MATCH == strcasecmp(restart_mode, modes[ii].text)) {
				mode = modes[ii].mode;
				break;
			}
		}
#endif
		if (mode == (StopStateT)-1)	{
			dprintf(D_ALWAYS, "%s is not a valid value for MASTER_NEW_BINARY_RESTART. using GRACEFUL\n", restart_mode);
		}
		if (mode >= 0 && mode <= NONE)
			new_bin_restart_mode = mode;
		free(restart_mode);
	}

	preen_interval = param_integer( "PREEN_INTERVAL", 24*HOUR, 0 );
	if(preen_interval == 0) {
		EXCEPT("PREEN_INTERVAL in the condor configuration is too low (0).  Please set it to an integer in the range 1 to %d (default %d).  To disable condor_preen entirely, comment out PREEN.", INT_MAX, 24*HOUR);

	}

	shutdown_fast_timeout = param_integer( "SHUTDOWN_FAST_TIMEOUT", 5*MINUTE, 1 );

	shutdown_graceful_timeout = param_integer( "SHUTDOWN_GRACEFUL_TIMEOUT", 30*MINUTE, 1 );

	AllowAdminCommands = param_boolean( "ALLOW_ADMIN_COMMANDS", true );

	if( FS_Preen ) {
		free( FS_Preen );
	}
	FS_Preen = param( "PREEN" );
}
Exemple #28
0
GlobusResource::ReadFileStatus
GlobusResource::ReadMonitorJobStatusFile()
{
	// return true if file successfully processed and jobs notified,
	// else return false. 
	// TODO should distinguish between temporary and permanent problems.
	//   e.g. if file is incomplete, retry after short delay
	FILE *fp;
	char buff[1024];
	char contact[1024];
	int status;
	int scan_start = 0;
	int scan_finish = 0;
	int job_count = 0;

	if(monitorJobStatusFile == NULL) {
		EXCEPT("Consistency problem for GlobusResource::ReadMonitorJobStatusFile %s, null job status file name", resourceName);
	}

	fp = safe_fopen_wrapper_follow( monitorJobStatusFile, "r" );
	if ( fp == NULL ) {
		dprintf( D_ALWAYS, "Failed to open grid_monitor job status file %s\n",
				 monitorJobStatusFile );
		return RFS_ERROR;
	}

	if ( fgets( buff, sizeof(buff), fp ) == NULL ) {
		if( feof(fp) ) {
			dprintf( D_FULLDEBUG, "grid_monitor job status file empty (%s), treating as partial.\n",
					 monitorJobStatusFile );
			fclose( fp );
			return RFS_PARTIAL;
		}
		dprintf( D_ALWAYS, "Can't read grid_monitor job status file %s\n",
				 monitorJobStatusFile );
		fclose( fp );
		return RFS_ERROR;
	}
	if ( sscanf( buff, "%d %d", &scan_start, &scan_finish ) != 2 ) {
		dprintf( D_ALWAYS, "Failed to read scan times from grid_monitor "
				 "status file %s\n", monitorJobStatusFile );
		fclose( fp );
		return RFS_ERROR;
	}

	bool found_eof = false;
	while ( fgets( buff, sizeof(buff), fp ) != NULL ) {
		contact[0] = '\0';
		status = 0;

		const char * MAGIC_EOF = "GRIDMONEOF";
		if(strncmp(buff, MAGIC_EOF, strlen(MAGIC_EOF)) == 0) {
			found_eof = true;
			break;
		}

		if ( sscanf( buff, "%s %d", contact, &status ) == 2 &&
			 *contact != '\0' && status > 0 ) {
			int rc;
			GlobusJob *job = NULL;

			job_count++;

			rc = JobsByContact.lookup( HashKey( globusJobId(contact) ), job );
			if ( rc == 0 && job != NULL ) {
				if ( status == GLOBUS_GRAM_PROTOCOL_JOB_STATE_DONE ) {
					status=GLOBUS_GRAM_PROTOCOL_JOB_STATE_STAGE_OUT;
				}
					// Don't flood the log file
					// with a long stream of identical job status updates.
					// We do need to send identical job status updates to
					// the job so that it can track the last time we
					// received an update on its status.
				if ( status != job->globusState ) {
					dprintf(D_FULLDEBUG,"Sending callback of %d to %d.%d (%s)\n",status,job->procID.cluster,job->procID.proc, resourceName);
				}
				job->GramCallback( status, 0 );
			}
		}
	}

	fclose( fp );

	int limit = param_integer( "GRID_MONITOR_NO_STATUS_TIMEOUT", 15*60 );
	int now = time(NULL);
	GlobusJob *next_job;
	registeredJobs.Rewind();
	while ( (next_job = (GlobusJob *)registeredJobs.Next()) != NULL ) {
		if ( next_job->jobContact &&
			 now > next_job->lastRemoteStatusUpdate + limit ) {
			next_job->SetEvaluateState();
		}
	}

	dprintf( D_FULLDEBUG, "Read %s grid_monitor status file for %s: "
			 "scan start=%d, scan finish=%d, job count=%d\n",
			 found_eof ? "full" : "partial",
			 resourceName, scan_start, scan_finish, job_count );

	if(found_eof)
		return RFS_OK;
	return RFS_PARTIAL;
}
Exemple #29
0
void
GlobusResource::CheckMonitor()
{
	BaseJob *base_job = NULL;
	GlobusJob *job;
	// TODO should we require our jobs to request the grid monitor before
	//   we'll start it up?
	// TODO what if we're in the middle of a ping when we get here? Either
	//   delay until ping is done or have seperate GahpClient
	// TODO if resource is down, should we delay any actions?
	daemonCore->Reset_Timer( checkMonitorTid, TIMER_NEVER );
	dprintf(D_FULLDEBUG, "grid_monitor for %s entering CheckMonitor\n",
		resourceName);

	if ( m_versionKnown && m_isGt5 ) {
		dprintf( D_FULLDEBUG, "Disabling grid_monitor for GRAM5 server %s\n",
				 resourceName );
		return;
	}

	if ( monitorGahp->isInitialized() == false ) {
		dprintf( D_ALWAYS, "GAHP server not initialized yet, not submitting "
				 "grid_monitor now\n" );
		daemonCore->Reset_Timer( checkMonitorTid, 5 );
		return;
	}

	if ( !enableGridMonitor ) {
		return;
	}

	if ( time(NULL) < monitorRetryTime ) {
		daemonCore->Reset_Timer( checkMonitorTid,
								 monitorRetryTime - time(NULL) );
		return;
	}

	if ( firstPingDone == false ) {
		dprintf(D_FULLDEBUG,"grid_monitor for %s: first ping not done yet, "
			"will retry later\n", resourceName);
		daemonCore->Reset_Timer( checkMonitorTid, 5 );
		return;
	}

	if ( monitorSubmitActive ) {
		int rc;
		std::string job_contact;
		monitorGahp->setMode( GahpClient::results_only );
		rc = monitorGahp->globus_gram_client_job_request( NULL, NULL, 0, NULL,
														  job_contact, false );
		if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
			 rc == GAHPCLIENT_COMMAND_PENDING ) {
				// do nothing
		} else if ( rc == 0 ) {
				// successful submit
			monitorGramJobId = strdup( job_contact.c_str() );
			monitorSubmitActive = false;
		} else {
				// submit failed
			dprintf(D_ALWAYS, "grid_monitor job submit failed for resource %s, gram error %d (%s)\n",
					resourceName, rc,
					monitorGahp->globus_gram_client_error_string(rc));
			monitorSubmitActive = false;
			AbandonMonitor();
			return;
		}
	}

	if ( monitorActive == false && monitorStarting == false ) {
		monitorStarting = true;
		if ( SubmitMonitorJob() == true ) {
			daemonCore->Reset_Timer( checkMonitorTid, 30 );
		} else {
			dprintf(D_ALWAYS, "Unable to start grid_monitor for resource %s\n",
				resourceName);
			// TODO: Do nice retry?
			AbandonMonitor();
		}
	} else {
		int rc;
		struct stat file_status;
		int job_status_mod_time;
		int log_mod_time;

		if(monitorJobStatusFile == NULL) {
			EXCEPT("Consistency problem for GlobusResource %s, null job status file name", resourceName);
		}

		if(monitorLogFile == NULL) {
			EXCEPT("Consistency problem for GlobusResource %s, null monitor log file name", resourceName);
		}

		rc = stat( monitorJobStatusFile, &file_status );
		if ( rc < 0 ) {
			EXCEPT( "stat(%s) failed, errno=%d", monitorJobStatusFile, errno );
		}
		job_status_mod_time = file_status.st_mtime;

		rc = stat( monitorLogFile, &file_status );
		if ( rc < 0 ) {
			EXCEPT( "stat(%s) failed, errno=%d", monitorLogFile, errno );
		}
		log_mod_time = file_status.st_mtime;

		if ( job_status_mod_time > jobStatusFileLastReadTime ) {
			dprintf(D_FULLDEBUG, "grid_monitor job status for %s file has been refreshed.\n", resourceName);

			ReadFileStatus status = ReadMonitorJobStatusFile();
			if(status == RFS_OK) {
				dprintf(D_FULLDEBUG, "Read grid_monitor status file for %s successfully\n", resourceName);
				jobStatusFileLastReadTime = time(NULL);
				jobStatusFileLastUpdate = time(NULL);
				daemonCore->Reset_Timer( checkMonitorTid, 30 );

			} else if(status == RFS_PARTIAL) {
				const int RETRY_TIME = 10;
				dprintf(D_FULLDEBUG,"*** status file is partial, "
					"will try again in %d seconds\n", RETRY_TIME);
				daemonCore->Reset_Timer( checkMonitorTid, RETRY_TIME );

			} else if(status == RFS_ERROR) {
				dprintf(D_ALWAYS,"grid_monitor: error reading job status "
					"file for %s, stopping grid monitor\n", resourceName);
				// TODO: Try to restart monitor?
				AbandonMonitor();
				return;

			} else {
				EXCEPT("ReadMonitorJobStatusFile returned unexpected %d "
					"(for %s)", (int)status, resourceName);
			}

		}

		int log_file_timeout = param_integer("GRID_MONITOR_HEARTBEAT_TIMEOUT", 300);
		int monitor_retry_duration = param_integer("GRID_MONITOR_RETRY_DURATION", 900);

		if ( log_mod_time > logFileLastReadTime ) {

			dprintf(D_FULLDEBUG, "grid_monitor log file for %s updated.\n",
				resourceName);
			rc = ReadMonitorLogFile();

			switch( rc ) {
			case 0: // Normal / OK
				dprintf(D_FULLDEBUG,
					"grid_monitor log file for %s looks normal\n",
					resourceName);
				if ( monitorStarting ) {
					dprintf(D_ALWAYS, "Successfully started grid_monitor "
							"for %s\n", resourceName);
					monitorStarting = false;
					monitorFirstStartup = false;
					monitorActive = true;
					registeredJobs.Rewind();
					while ( registeredJobs.Next( base_job ) ) {
						job = dynamic_cast<GlobusJob*>( base_job );
						job->SetEvaluateState();
					}
				}
				logFileLastReadTime = time(NULL);
				daemonCore->Reset_Timer( checkMonitorTid, 30 );
				break;

			case 1: // Exitted normally (should restart)
				dprintf(D_FULLDEBUG,
					"grid_monitor for %s reached maximum lifetime, "
					"restarting...\n", resourceName);
				if ( SubmitMonitorJob() == true ) {
					dprintf(D_FULLDEBUG,
						"grid_monitor for %s restarted.\n", resourceName);
					daemonCore->Reset_Timer( checkMonitorTid, 30 );
				} else {
					dprintf(D_ALWAYS, 
						"Unable to restart grid_monitor for resource %s\n",
						resourceName);
					// TODO: Try to restart monitor?
					AbandonMonitor();
				}
				break;

			case 2: // Exitted with error
				dprintf(D_ALWAYS,"Error with grid_monitor for %s, stopping.\n",
					resourceName);
				// TODO: Try to restart monitor?
				AbandonMonitor();
				break;

			default:
				EXCEPT( "Unknown return value %d from ReadLogFile", rc );

			}

		} else if ( time(NULL) > logFileLastReadTime + log_file_timeout &&
					!monitorStarting ) {
			dprintf( D_ALWAYS, "Haven't heard from running grid_monitor "
					 "at %s for %d seconds, trying new job submission\n",
					 resourceName, log_file_timeout );
			if( ! SubmitMonitorJob() ) {
				dprintf(D_ALWAYS, "Failed to restart grid_monitor.  Giving up on grid_monitor for site %s\n", resourceName);
				AbandonMonitor();
			}
			daemonCore->Reset_Timer( checkMonitorTid, 30);

		} else if ( monitorStarting &&
					time(NULL) > logFileLastReadTime + monitor_retry_duration) {
			dprintf( D_ALWAYS, "Haven't heard from new grid_monitor "
					 "at %s for %d seconds, giving up\n",
					 resourceName, monitor_retry_duration );
			AbandonMonitor();

		} else {
			daemonCore->Reset_Timer( checkMonitorTid, 30 );
		}
	}

	return;
}
Exemple #30
0
void BaseResource::Reconfig()
{
	int tmp_int;
	char *param_value;
	std::string param_name;

	tmp_int = param_integer( "GRIDMANAGER_RESOURCE_PROBE_INTERVAL", 5 * 60 );
	setProbeInterval( tmp_int );

	jobLimit = -1;
	formatstr( param_name, "GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE_%s",
						ResourceType() );
	param_value = param( param_name.c_str() );
	if ( param_value == NULL ) {
		param_value = param( "GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE" );
	}
	if ( param_value != NULL ) {
		char *tmp1;
		char *tmp2;
		StringList limits( param_value );
		limits.rewind();
		if ( limits.number() > 0 ) {
			jobLimit = atoi( limits.next() );
			while ( (tmp1 = limits.next()) && (tmp2 = limits.next()) ) {
				if ( strstr( resourceName, tmp1 ) != 0 ) {
					jobLimit = atoi( tmp2 );
				}
			}
		}
		free( param_value );
	}
	if ( jobLimit <= 0 ) {
		jobLimit = DEFAULT_MAX_SUBMITTED_JOBS_PER_RESOURCE;
	}

	// If the jobLimit was widened, move jobs from Wanted to Allowed and
	// signal them
	while ( submitsAllowed.Length() < jobLimit &&
			submitsWanted.Length() > 0 ) {
		BaseJob *wanted_job = submitsWanted.Head();
		submitsWanted.Delete( wanted_job );
		submitsAllowed.Append( wanted_job );
		wanted_job->SetEvaluateState();
	}

	formatstr( param_name, "GRIDMANAGER_JOB_PROBE_RATE_%s", ResourceType() );
	m_paramJobPollRate = param_integer( param_name.c_str(), -1 );
	if ( m_paramJobPollRate <= 0 ) {
		m_paramJobPollRate = param_integer( "GRIDMANAGER_JOB_PROBE_RATE",
											DEFAULT_JOB_POLL_RATE );
	}
	if ( m_paramJobPollRate <= 0 ) {
		m_paramJobPollRate = DEFAULT_JOB_POLL_RATE;
	}

	const char *legacy_job_poll_param = NULL;
	const char *type = ResourceType();
	if ( strcmp( type, "condor" ) == 0 ) {
		legacy_job_poll_param = "CONDOR_JOB_POLL_INTERVAL";
	} else if ( strcmp( type, "batch" ) == 0 ||
				strcmp( type, "pbs" ) == 0 ||
				strcmp( type, "lsf" ) == 0 ||
				strcmp( type, "nqs" ) == 0 ||
				strcmp( type, "sge" ) == 0 ||
				strcmp( type, "naregi" ) == 0 ) {
		legacy_job_poll_param = "INFN_JOB_POLL_INTERVAL";
	}

	formatstr( param_name, "GRIDMANAGER_JOB_PROBE_INTERVAL_%s", ResourceType() );
	m_paramJobPollInterval = param_integer( param_name.c_str(), -1 );
	if ( m_paramJobPollInterval <= 0 ) {
		m_paramJobPollInterval = param_integer( "GRIDMANAGER_JOB_PROBE_INTERVAL", -1 );
	}
	if ( m_paramJobPollInterval <= 0 && legacy_job_poll_param ) {
		m_paramJobPollInterval = param_integer( legacy_job_poll_param, -1 );
	}
	if ( m_paramJobPollInterval <= 0 ) {
		m_paramJobPollInterval = DEFAULT_JOB_POLL_INTERVAL;
	}

	SetJobPollInterval();

	_collectorUpdateInterval = param_integer ( 
		"GRIDMANAGER_COLLECTOR_UPDATE_INTERVAL", 5*60 );

}