void AvailStats::compute( amask_t how_much )
{
    if( !compute_avail_stats ) return;

    if( IS_STATIC(how_much) && IS_SHARED(how_much) ) {
        as_avail_confidence = param_double("STARTD_AVAIL_CONFIDENCE", as_avail_confidence, 0, 1);
        as_max_avail_periods = param_integer("STARTD_MAX_AVAIL_PERIOD_SAMPLES", as_max_avail_periods);
    }

    if( IS_UPDATE(how_much) && IS_SHARED(how_much) ) {
        time_t current_time = time(0);

        // only compute avail time estimate if we're in non-owner state
        if( as_start_avail ) {
            // first, skip over intervals less than our idle time so far
            int current_idle_time = current_time - as_start_avail;
            int num_intervals = as_num_avail_periods;
            as_avail_periods.Rewind();
            int item = 0;
            as_avail_periods.Next(item);
            while( num_intervals && item < current_idle_time ) {
                as_avail_periods.Next(item);
                num_intervals--;
            }
            if( !num_intervals ) {
                // If this is the longest we've ever been idle, our
                // historical data isn't too useful to us, so give an
                // estimate based on how long we've been idle so far.
                as_avail_estimate =
                    (int)floor(current_idle_time*(2.0-as_avail_confidence));
            } else {
                // Otherwise, find the record in our history at the
                // requested confidence level.
                int idx = (int)floor(num_intervals*(1.0-as_avail_confidence));
                while( idx ) {
                    as_avail_periods.Next(item);
                    idx--;
                }
                as_avail_estimate = item;
            }
            as_avail_time = float(as_tot_avail_time + current_idle_time) /
                            float(current_time - as_birthdate);
        } else {
            as_avail_time = float(as_tot_avail_time) /
                            float(current_time - as_birthdate);
        }
    }
}
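// Worked example of the confidence lookup above (hypothetical values,
// and assuming as_avail_periods is kept sorted in ascending order):
// suppose 20 recorded availability periods remain after skipping those
// shorter than the current idle time, and STARTD_AVAIL_CONFIDENCE = 0.8.
// Then idx = floor(20 * (1.0 - 0.8)) = 4, so the code advances 4 more
// records into the history and reports that period length; roughly 80%
// of comparable past idle periods lasted at least that long.  With no
// records left to consult, the fallback estimate is
// floor(idle * (2.0 - 0.8)) = 1.2 * idle, extrapolated from the current
// idle stretch alone.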
void StarterStatistics::Reconfig() {
    int quantum = param_integer("STATISTICS_WINDOW_QUANTUM_STARTER", INT_MAX, 1, INT_MAX);
    if (quantum >= INT_MAX)
        quantum = param_integer("STATISTICS_WINDOW_QUANTUM", 4*60, 1, INT_MAX);
    this->RecentWindowQuantum = quantum;

    int window = param_integer("STATISTICS_WINDOW_SECONDS_STARTER", INT_MAX, 1, INT_MAX);
    if (window >= INT_MAX)
        window = param_integer("STATISTICS_WINDOW_SECONDS", 1200, quantum, INT_MAX);
    this->RecentWindowMax = window;

    Pool.SetRecentMax(window, this->RecentWindowQuantum);

    this->PublishFlags = IF_BASICPUB | IF_RECENTPUB;
    char* tmp = param("STATISTICS_TO_PUBLISH");
    if (tmp) {
        this->PublishFlags = generic_stats_ParseConfigString(tmp, "STARTER", "_no_alternate_name_", this->PublishFlags);
        free(tmp);
    }
}
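// The INT_MAX sentinel above implements per-subsystem overrides: if
// STATISTICS_WINDOW_QUANTUM_STARTER is unset, param_integer() returns the
// INT_MAX default and the code falls back to the generic
// STATISTICS_WINDOW_QUANTUM (default 4*60 = 240 seconds); likewise the
// 1200-second STATISTICS_WINDOW_SECONDS window is used unless the
// STARTER-specific knob is defined.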
void main_init(int argc, char *argv[])
{
    _EXCEPT_Cleanup = ExceptCleanup;

    /* Start up with condor.condor privileges. */
    set_condor_priv();

    // Register a do-nothing reaper.  This is just because the
    // file transfer object, which could be instantiated later,
    // registers its own reaper and does an EXCEPT if it gets
    // a reaper ID of 1 (since lots of other daemons have a reaper
    // ID of 1 hard-coded as special... this is bad).
    daemonCore->Register_Reaper("dummy_reaper",
                                (ReaperHandler)&dummy_reaper,
                                "dummy_reaper", NULL);

    // register SIGUSR1 (condor_rm) for shutdown...
    daemonCore->Register_Signal( SIGUSR1, "SIGUSR1",
        (SignalHandler)&handleSignals, "handleSignals");
    // register UPDATE_JOBAD for qedit changes
    daemonCore->Register_Signal( UPDATE_JOBAD, "UPDATE_JOBAD",
        (SignalHandler)&handleSignals, "handleSignals");
    // handle daemoncore signals which are passed down
    daemonCore->Register_Signal( DC_SIGSUSPEND, "DC_SIGSUSPEND",
        (SignalHandler)&handleSignals, "handleSignals");
    daemonCore->Register_Signal( DC_SIGCONTINUE, "DC_SIGCONTINUE",
        (SignalHandler)&handleSignals, "handleSignals");

    int shadow_worklife = param_integer( "SHADOW_WORKLIFE", 3600 );
    if( shadow_worklife > 0 ) {
        shadow_worklife_expires = time(NULL) + shadow_worklife;
    }
    else if( shadow_worklife == 0 ) {
        // run one job and then exit
        shadow_worklife_expires = time(NULL)-1;
    }
    else {
        shadow_worklife_expires = 0;
    }

    parseArgs( argc, argv );

    CheckSpoolVersion(SPOOL_MIN_VERSION_SHADOW_SUPPORTS,
                      SPOOL_CUR_VERSION_SHADOW_SUPPORTS);

    ClassAd* ad = readJobAd();
    if( ! ad ) {
        EXCEPT( "Failed to read job ad!" );
    }

    startShadow( ad );
}
// Initialize to read the global event log
bool ReadUserLog::initialize( void )
{
    char *path = param( "EVENT_LOG" );
    if ( NULL == path ) {
        Error( LOG_ERROR_FILE_NOT_FOUND, __LINE__ );
        return false;
    }

    int max_rotations = param_integer( "EVENT_LOG_MAX_ROTATIONS", 1, 0 );

    bool status = initialize( path, max_rotations, true );
    free( path );
    return status;
}
void BaseShadow::config()
{
    if (spool) free(spool);
    spool = param("SPOOL");
    if (!spool) {
        EXCEPT("SPOOL not specified in config file.");
    }

    if (fsDomain) free(fsDomain);
    fsDomain = param( "FILESYSTEM_DOMAIN" );
    if (!fsDomain) {
        EXCEPT("FILESYSTEM_DOMAIN not specified in config file.");
    }

    if (uidDomain) free(uidDomain);
    uidDomain = param( "UID_DOMAIN" );
    if (!uidDomain) {
        EXCEPT("UID_DOMAIN not specified in config file.");
    }

    reconnect_ceiling = param_integer( "RECONNECT_BACKOFF_CEILING", 300 );

    reconnect_e_factor = param_double( "RECONNECT_BACKOFF_FACTOR", 2.0, 0.0 );
    // Guard against a (near-)zero factor, which would disable the
    // exponential backoff entirely; fall back to the default of 2.0.
    if( reconnect_e_factor > -1e-4 && reconnect_e_factor < 1e-4 ) {
        reconnect_e_factor = 2.0;
    }

    m_cleanup_retry_tid = -1;
    m_num_cleanup_retries = 0;
    // NOTE: these config knobs are very similar to
    // LOCAL_UNIVERSE_MAX_JOB_CLEANUP_RETRIES and
    // LOCAL_UNIVERSE_JOB_CLEANUP_RETRY_DELAY in the local starter.
    m_max_cleanup_retries = param_integer("SHADOW_MAX_JOB_CLEANUP_RETRIES", 5);
    m_cleanup_retry_delay = param_integer("SHADOW_JOB_CLEANUP_RETRY_DELAY", 30);

    m_lazy_queue_update = param_boolean("SHADOW_LAZY_QUEUE_UPDATE", true);
}
void TransferQueueManager::parseThrottleConfig(char const *config_param, bool &enable_throttle, double &low, double &high, std::string &throttle_short_horizon, std::string &throttle_long_horizon, time_t &throttle_increment_wait)
{
    enable_throttle = false;

    std::string throttle_config;
    if( !param(throttle_config, config_param) ) {
        return;
    }

    char *endptr = NULL;
    low = strtod(throttle_config.c_str(), &endptr);
    if( !endptr || !(*endptr == ' ' || *endptr == '\0') ) {
        EXCEPT("Invalid configuration for %s: %s",
               config_param, throttle_config.c_str());
        return;
    }
    while( *endptr == ' ' ) endptr++;
    if( *endptr == '\0' ) {
        // a single number means "throttle at this load";
        // derive the low-water mark from it
        high = low;
        low = 0.9*high;
    }
    else if( strncmp(endptr, "to ", 3) == 0 ) {
        endptr += 3;
        while( *endptr == ' ' ) endptr++;
        high = strtod(endptr, &endptr);
        if( !endptr || *endptr != '\0' ) {
            dprintf(D_ALWAYS, "Invalid configuration for %s: %s\n",
                    config_param, throttle_config.c_str());
            return;
        }
    }
    else {
        EXCEPT("Invalid configuration for %s: %s",
               config_param, throttle_config.c_str());
    }

    // for now, these are hard-coded
    std::string horizon_param;
    formatstr(horizon_param, "%s_SHORT_HORIZON", config_param);
    param(throttle_short_horizon, horizon_param.c_str(), "1m");

    formatstr(horizon_param, "%s_LONG_HORIZON", config_param);
    param(throttle_long_horizon, horizon_param.c_str(), "5m");

    std::string wait_param;
    formatstr(wait_param, "%s_WAIT_BETWEEN_INCREMENTS", config_param);
    throttle_increment_wait = (time_t) param_integer(wait_param.c_str(), 60, 0);

    enable_throttle = true;
}
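// Accepted syntax for the throttle knob parsed above, illustrated with
// hypothetical values (config_param is whatever name the caller passes in,
// e.g. FILE_TRANSFER_DISK_LOAD_THROTTLE):
//
//   <PARAM> = 5          # high = 5.0, low = 0.9 * 5.0 = 4.5
//   <PARAM> = 5 to 10    # low = 5.0, high = 10.0
//   <PARAM>_SHORT_HORIZON = 1m
//   <PARAM>_LONG_HORIZON = 5m
//   <PARAM>_WAIT_BETWEEN_INCREMENTS = 60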
bool SelfMonitorData::ExportData(ClassAd *ad)
{
    bool success;

    if (ad == NULL) {
        success = false;
    } else {
        ad->Assign("MonitorSelfTime", last_sample_time);
        ad->Assign("MonitorSelfCPUUsage", cpu_usage);
        ad->Assign("MonitorSelfImageSize", image_size);
        ad->Assign("MonitorSelfResidentSetSize", rs_size);
        ad->Assign("MonitorSelfAge", age);
        ad->Assign("MonitorSelfRegisteredSocketCount", registered_socket_count);
        ad->Assign("MonitorSelfSecuritySessions", cached_security_sessions);

        ad->Assign(ATTR_DETECTED_CPUS, param_integer("DETECTED_CORES", 0));
        ad->Assign(ATTR_DETECTED_MEMORY, param_integer("DETECTED_MEMORY", 0));

        success = true;
    }

    return success;
}
void HibernationManager::update( void )
{
    int previous_interval = m_interval;
    m_interval = param_integer( "HIBERNATE_CHECK_INTERVAL",
        0 /* default */, 0 /* min; no max */ );

    bool change = ( previous_interval != m_interval );
    if ( change ) {
        dprintf ( D_ALWAYS, "HibernationManager: Hibernation is %s\n",
            ( m_interval > 0 ? "enabled" : "disabled" ) );
    }

    if ( m_hibernator ) {
        m_hibernator->update( );
    }
}
void MachAttributes::credd_test()
{
    // Attempt to perform a NOP on our CREDD_HOST. This will test
    // our ability to authenticate with DAEMON-level auth, and thus
    // fetch passwords. If we succeed, we'll advertise the CREDD_HOST.

    char *credd_host = param("CREDD_HOST");

    if (credd_host == NULL) {
        if (m_local_credd != NULL) {
            free(m_local_credd);
            m_local_credd = NULL;
        }
        return;
    }

    if (m_local_credd != NULL) {
        if (strcmp(m_local_credd, credd_host) == 0) {
            free(credd_host);
            return;
        }
        else {
            free(m_local_credd);
            m_local_credd = NULL;
            m_last_credd_test = 0;
        }
    }

    time_t now = time(NULL);
    double thresh = (double)param_integer("CREDD_TEST_INTERVAL", 300);
    if (difftime(now, m_last_credd_test) > thresh) {
        Daemon credd(DT_CREDD);
        if (credd.locate()) {
            Sock *sock = credd.startCommand(CREDD_NOP, Stream::reli_sock, 20);
            if (sock != NULL) {
                sock->decode();
                if (sock->end_of_message()) {
                    // success: adopt credd_host as m_local_credd
                    m_local_credd = credd_host;
                }
            }
        }
        m_last_credd_test = now;
    }
    if (credd_host != m_local_credd) {
        free(credd_host);
    }
}
void NordugridJobReconfig()
{
    int tmp_int;

    tmp_int = param_integer( "GRIDMANAGER_GAHP_CALL_TIMEOUT", 5 * 60 );
    NordugridJob::setGahpCallTimeout( tmp_int );

    // Tell all the resource objects to deal with their new config values
    NordugridResource *next_resource;

    NordugridResource::ResourcesByName.startIterations();

    while ( NordugridResource::ResourcesByName.iterate( next_resource ) != 0 ) {
        next_resource->Reconfig();
    }
}
void ScheddStatistics::Reconfig()
{
    this->RecentWindowMax = param_integer("STATISTICS_WINDOW_SECONDS", 1200,
                                          schedd_stats_window_quantum, INT_MAX);

    this->PublishFlags = IF_BASICPUB | IF_RECENTPUB;
    char * tmp = param("STATISTICS_TO_PUBLISH");
    if (tmp) {
        this->PublishFlags = generic_stats_ParseConfigString(tmp, "SCHEDD", "SCHEDULER", this->PublishFlags);
        free(tmp);
    }
    SetWindowSize(this->RecentWindowMax);

    //stats_histogram_sizes::init_sizes_from_param("MAX_HIST_SIZES_LEVELS");
    //JobSizes.reconfig();
    //JobSizesGoodput.reconfig();
    //JobSizesBadput.reconfig();
}
HibernatorBase::SLEEP_STATE
UserDefinedToolsHibernator::enterState ( HibernatorBase::SLEEP_STATE state ) const
{
    /** Make sure a tool for this sleep state has been defined */
    unsigned index = sleepStateToInt ( state );
    if ( NULL == m_tool_paths[index] ) {
        dprintf ( D_FULLDEBUG, "Hibernator::%s tool not configured.\n",
                  HibernatorBase::sleepStateToString ( state ) );
        return HibernatorBase::NONE;
    }

    /** Tell DaemonCore to register the process family so we can
        safely kill everything from the reaper */
    FamilyInfo fi;
    fi.max_snapshot_interval = param_integer ( "PID_SNAPSHOT_INTERVAL", 15 );

    /** Run the user tool */
    int pid = daemonCore->Create_Process (
        m_tool_paths[index], m_tool_args[index], PRIV_CONDOR_FINAL,
        m_reaper_id, FALSE, FALSE, NULL, NULL, &fi );
    if ( FALSE == pid ) {
        dprintf ( D_ALWAYS, "UserDefinedToolsHibernator::enterState: "
                  "Create_Process() failed\n" );
        return HibernatorBase::NONE;
    }

    return state;
}
void BaseResource::UnregisterJob( BaseJob *job )
{
    CancelSubmit( job );

    pingRequesters.Delete( job );
    registeredJobs.Delete( job );
    leaseUpdates.Delete( job );

    if ( IsEmpty() ) {
        int delay = param_integer( "GRIDMANAGER_EMPTY_RESOURCE_DELAY", 5*60 );
        if ( delay < 0 ) {
            delay = 0;
        }
        deleteMeTid = daemonCore->Register_Timer( delay,
                            (TimerHandlercpp)&BaseResource::DeleteMe,
                            "BaseResource::DeleteMe", (Service*)this );
    }
}
BaseResource::BaseResource( const char *resource_name )
{
    resourceName = strdup( resource_name );
    deleteMeTid = TIMER_UNSET;

    resourceDown = false;
    firstPingDone = false;
    pingActive = false;
    pingTimerId = daemonCore->Register_Timer( 0,
                        (TimerHandlercpp)&BaseResource::Ping,
                        "BaseResource::Ping", (Service*)this );
    lastPing = 0;
    lastStatusChange = 0;

    jobLimit = DEFAULT_MAX_SUBMITTED_JOBS_PER_RESOURCE;

    hasLeases = false;
    updateLeasesTimerId = daemonCore->Register_Timer( 0,
                        (TimerHandlercpp)&BaseResource::UpdateLeases,
                        "BaseResource::UpdateLeases", (Service*)this );
    lastUpdateLeases = 0;
    updateLeasesActive = false;
    updateLeasesCmdActive = false;
    m_hasSharedLeases = false;
    m_defaultLeaseDuration = -1;
    m_sharedLeaseExpiration = 0;

    _updateCollectorTimerId = daemonCore->Register_Timer( 0,
                        (TimerHandlercpp)&BaseResource::UpdateCollector,
                        "BaseResource::UpdateCollector", (Service*)this );
    _lastCollectorUpdate = 0;
    _firstCollectorUpdate = true;
    _collectorUpdateInterval = param_integer(
        "GRIDMANAGER_COLLECTOR_UPDATE_INTERVAL", 5*60 );

    m_batchStatusActive = false;
    m_batchPollTid = TIMER_UNSET;
    m_paramJobPollRate = -1;
    m_paramJobPollInterval = -1;
    m_jobPollInterval = 0;
}
int countTypes( int max_types, int num_cpus, int** array_ptr, bool except )
{
    int i, num=0, num_set=0;
    MyString param_name;
    MyString cruft_name;
    int* my_type_nums = new int[max_types];

    if( ! array_ptr ) {
        EXCEPT( "ResMgr:countTypes() called with NULL array_ptr!" );
    }

    // Type 0 is special; users shouldn't define it.
    _checkInvalidParam("NUM_SLOTS_TYPE_0", except);
    // CRUFT
    _checkInvalidParam("NUM_VIRTUAL_MACHINES_TYPE_0", except);

    for( i=1; i<max_types; i++ ) {
        param_name.formatstr("NUM_SLOTS_TYPE_%d", i);
        if (param_boolean("ALLOW_VM_CRUFT", false)) {
            cruft_name.formatstr("NUM_VIRTUAL_MACHINES_TYPE_%d", i);
            my_type_nums[i] = param_integer(param_name.Value(),
                                param_integer(cruft_name.Value(), 0));
        } else {
            my_type_nums[i] = param_integer(param_name.Value(), 0);
        }
        if (my_type_nums[i]) {
            num_set = 1;
            num += my_type_nums[i];
        }
    }

    if( num_set ) {
        // We found type-specific stuff; use that.
        my_type_nums[0] = 0;
    } else {
        // We haven't found any special types yet.  Therefore,
        // we're evenly dividing things, so we only have to figure
        // out how many nodes to advertise.
        if (param_boolean("ALLOW_VM_CRUFT", false)) {
            my_type_nums[0] = param_integer("NUM_SLOTS",
                                param_integer("NUM_VIRTUAL_MACHINES", num_cpus));
        } else {
            my_type_nums[0] = param_integer("NUM_SLOTS", num_cpus);
        }
        num = my_type_nums[0];
    }
    *array_ptr = my_type_nums;
    return num;
}
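// Illustrative configuration for the logic above (hypothetical values):
//
//   NUM_SLOTS_TYPE_1 = 2
//   NUM_SLOTS_TYPE_2 = 1
//
// yields my_type_nums = {0, 2, 1, 0, ...} and a return value of 3.  With
// no per-type knobs set, NUM_SLOTS (defaulting to num_cpus) determines how
// many evenly-divided type-0 slots to advertise.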
void CCBListener::InitAndReconfig()
{
    int new_heartbeat_interval = param_integer("CCB_HEARTBEAT_INTERVAL", 1200, 0);
    if( new_heartbeat_interval != m_heartbeat_interval ) {
        if( new_heartbeat_interval < 30 && new_heartbeat_interval > 0 ) {
            // CCB server doesn't expect a high rate of unsolicited
            // input from us
            new_heartbeat_interval = 30;
            dprintf(D_ALWAYS,
                    "CCBListener: using minimum heartbeat interval of %ds\n",
                    new_heartbeat_interval);
        }
        m_heartbeat_interval = new_heartbeat_interval;
        if( m_heartbeat_initialized ) {
            RescheduleHeartbeat();
        }
    }
}
void GlobusResource::Reconfig()
{
    int tmp_int;

    BaseResource::Reconfig();
    gahp->setTimeout( gahpCallTimeout );

    tmp_int = param_integer( "GRIDMANAGER_MAX_JOBMANAGERS_PER_RESOURCE",
                             DEFAULT_MAX_JOBMANAGERS_PER_RESOURCE );
    if ( tmp_int == 0 ) {
        submitJMLimit = GM_RESOURCE_UNLIMITED;
        restartJMLimit = GM_RESOURCE_UNLIMITED;
    } else {
        if ( tmp_int < 2 ) {
            tmp_int = 2;
        }
        submitJMLimit = tmp_int / 2;
        restartJMLimit = tmp_int - submitJMLimit;
    }

    // If the jobmanager limits were widened, move jobs from Wanted to
    // Allowed and signal them
    while ( ( submitJMsAllowed.Length() + restartJMsAllowed.Length() <
              submitJMLimit + restartJMLimit ) &&
            ( submitJMsWanted.Length() != 0 ||
              restartJMsWanted.Length() != 0 ) ) {
        JMComplete( NULL );
    }

    if ( enableGridMonitor ) {
        // start grid monitor
        daemonCore->Reset_Timer( checkMonitorTid, 0 );
    } else {
        // stop grid monitor
        if ( monitorActive || monitorStarting ) {
            StopMonitor();
        }
        daemonCore->Reset_Timer( checkMonitorTid, TIMER_NEVER );
    }
}
Timeslice &DCCollector::getBlacklistTimeslice()
{
    std::map< std::string, Timeslice >::iterator itr;
    itr = blacklist.find(addr());
    if( itr == blacklist.end() ) {
        Timeslice ts;

        // Blacklist this collector if last failed contact took more
        // than 1% of the time that has passed since that operation
        // started.  (i.e. if contact fails quickly, don't worry, but
        // if it takes a long time to fail, be cautious.)
        ts.setTimeslice(0.01);

        // Set an upper bound of one hour for the collector to be blacklisted.
        int avoid_time = param_integer("DEAD_COLLECTOR_MAX_AVOIDANCE_TIME", 3600);
        ts.setMaxInterval(avoid_time);
        ts.setInitialInterval(0);

        itr = blacklist.insert(
            std::map< std::string, Timeslice >::value_type(addr(), ts) ).first;
    }
    return itr->second;
}
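// Rough numeric intuition, assuming the Timeslice object spaces attempts
// so that the failed-contact time stays at or below the configured 1%
// fraction: a contact attempt that takes ~30 seconds to fail would keep
// this collector blacklisted on the order of 30 / 0.01 ~= 3000 seconds,
// while DEAD_COLLECTOR_MAX_AVOIDANCE_TIME (3600 by default) caps any
// single avoidance period at one hour.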
int CollectorEngine::scheduleHousekeeper (int timeout)
{
    // Are we filtering updates that we forward to the view collector?
    std::string watch_list;
    param(watch_list, "COLLECTOR_FORWARD_WATCH_LIST",
          "State,Cpus,Memory,IdleJobs,ClaimId,Capability,ClaimIdList,ChildClaimIds");
    m_forwardWatchList.clearAll();
    m_forwardWatchList.initializeFromString(watch_list.c_str());

    m_forwardFilteringEnabled = param_boolean( "COLLECTOR_FORWARD_FILTERING", false );

    // cancel outstanding housekeeping requests
    if (housekeeperTimerID != -1) {
        (void) daemonCore->Cancel_Timer(housekeeperTimerID);
    }

    // reset for new timer
    if (timeout < 0)
        return 0;

    // set to new timeout interval
    machineUpdateInterval = timeout;

    m_forwardInterval = param_integer("COLLECTOR_FORWARD_INTERVAL",
                                      machineUpdateInterval / 3, 0);

    // if timeout interval was non-zero (i.e., housekeeping required) ...
    if (timeout > 0) {
        // schedule housekeeper
        housekeeperTimerID = daemonCore->Register_Timer(machineUpdateInterval,
                        machineUpdateInterval,
                        (TimerHandlercpp)&CollectorEngine::housekeeper,
                        "CollectorEngine::housekeeper", this);
        if (housekeeperTimerID == -1)
            return 0;
    }

    return 1;
}
void BaseJob::BaseJobReconfig()
{
    int tmp_int;

    if ( periodicPolicyEvalTid != TIMER_UNSET ) {
        daemonCore->Cancel_Timer( periodicPolicyEvalTid );
        periodicPolicyEvalTid = TIMER_UNSET;
    }

    tmp_int = param_integer( "PERIODIC_EXPR_INTERVAL", 300 );
    if ( tmp_int > 0 ) {
        periodicPolicyEvalTid = daemonCore->Register_Timer( tmp_int, tmp_int,
                                    BaseJob::EvalAllPeriodicJobExprs,
                                    "EvalAllPeriodicJobExprs" );
    }

    if ( m_checkRemoteStatusTid == TIMER_UNSET ) {
        m_checkRemoteStatusTid = daemonCore->Register_Timer( 5, 60,
                                    BaseJob::CheckAllRemoteStatus,
                                    "BaseJob::CheckAllRemoteStatus" );
    }
}
void VMUniverseMgr::docheckVMUniverse(void)
{
    char *vm_type = param( "VM_TYPE" );

    dprintf( D_ALWAYS, "VM universe will be tested "
             "to check whether it is available\n");
    if( init() == false && vm_type ) {
        // VM universe is desired, but not available.
        // In VMware, some errors may be transient.
        // For example, when VMware fails to start a new VM
        // due to an incorrect config file, we are unable to
        // run the 'vmrun' command for a while, but after some
        // time we are able to run it again.
        // So we register a timer to call this function later.
        m_check_interval = param_integer("VM_RECHECK_INTERVAL", 600);
        if ( m_check_tid >= 0 ) {
            daemonCore->Reset_Timer( m_check_tid, m_check_interval );
        } else {
            m_check_tid = daemonCore->Register_Timer(m_check_interval,
                    (TimerHandlercpp)&VMUniverseMgr::docheckVMUniverse,
                    "VMUniverseMgr::docheckVMUniverse", this);
        }
        dprintf( D_ALWAYS, "Started a timer to test the VM universe again "
                 "in %d seconds\n", m_check_interval);
    } else {
        if ( m_check_tid >= 0 ) {
            daemonCore->Cancel_Timer( m_check_tid );
            m_check_tid = -1;

            // in the case where we had to use the timer,
            // make certain we publish our changes.
            if( resmgr ) {
                resmgr->eval_and_update_all();
            }
        }
    }
    free( vm_type );
}
void CCBListener::Disconnected()
{
    if( m_sock ) {
        daemonCore->Cancel_Socket( m_sock );
        delete m_sock;
        m_sock = NULL;
    }

    if( m_waiting_for_connect ) {
        m_waiting_for_connect = false;
        decRefCount();
    }

    m_waiting_for_registration = false;
    m_registered = false;

    StopHeartbeat();

    if( m_reconnect_timer != -1 ) {
        return; // already in progress
    }

    int reconnect_time = param_integer("CCB_RECONNECT_TIME", 60);

    dprintf(D_ALWAYS,
            "CCBListener: connection to CCB server %s failed; "
            "will try to reconnect in %d seconds.\n",
            m_ccb_address.Value(), reconnect_time);

    m_reconnect_timer = daemonCore->Register_Timer(
        reconnect_time,
        (TimerHandlercpp)&CCBListener::ReconnectTime,
        "CCBListener::ReconnectTime",
        this );

    ASSERT( m_reconnect_timer != -1 );
}
void Reconfig()
{
    contact_schedd_interval = param_integer("C_GAHP_CONTACT_SCHEDD_DELAY", 5);

    // When GSI authentication is used, we're willing to trust schedds
    // which have the same credential as the job
    if ( proxySubjectName ) {
        char *daemon_subjects = param( "GSI_DAEMON_NAME" );
        if ( daemon_subjects ) {
            std::string buff;
            formatstr( buff, "%s,%s", daemon_subjects, proxySubjectName );
            dprintf( D_ALWAYS, "Setting %s=%s\n", "GSI_DAEMON_NAME",
                     buff.c_str() );
            // We must use our daemon subsystem prefix in case the
            // admin used it in the config file.
            config_insert( "C_GAHP_WORKER_THREAD.GSI_DAEMON_NAME",
                           buff.c_str() );
            free( daemon_subjects );
        }
    }
}
ResourceRequestList::ResourceRequestList(int protocol_version)
    : m_send_end_negotiate(false),
      m_send_end_negotiate_now(false),
      m_requests_to_fetch(0)
{
    m_protocol_version = protocol_version;
    m_clear_rejected_autoclusters = false;
    m_use_resource_request_counts = param_boolean("USE_RESOURCE_REQUEST_COUNTS", true);
    if ( protocol_version == 0 || m_use_resource_request_counts == false ) {
        // Protocol version is 0, and schedd resource request lists were
        // introduced in protocol version 1, so set m_num_to_fetch to 1 so
        // we use the old protocol of getting one request at a time with
        // this old schedd.
        // We must also use the old protocol if the admin disabled
        // USE_RESOURCE_REQUEST_COUNTS, since the new protocol relies on it.
        m_num_to_fetch = 1;
    } else {
        m_num_to_fetch = param_integer("NEGOTIATOR_RESOURCE_REQUEST_LIST_SIZE");
    }
    errcode = 0;
    current_autocluster = -1;
    resource_request_count = 0;
    resource_request_offers = 0;
}
void BaseShadow::checkSwap( void )
{
    int reserved_swap, free_swap;

    // Reserved swap is specified in megabytes
    reserved_swap = param_integer( "RESERVED_SWAP", 0 );
    reserved_swap *= 1024;

    if( reserved_swap == 0 ) {
        // We're not supposed to care about swap space at all, so
        // none of the rest of the checks matter at all.
        return;
    }

    free_swap = sysapi_swap_space();

    dprintf( D_FULLDEBUG, "*** Reserved Swap = %d\n", reserved_swap );
    dprintf( D_FULLDEBUG, "*** Free Swap = %d\n", free_swap );

    if( free_swap < reserved_swap ) {
        dprintf( D_ALWAYS, "Not enough reserved swap space\n" );
        DC_Exit( JOB_NO_MEM );
    }
}
void Defrag::config()
{
    ASSERT( param(m_state_file, "DEFRAG_STATE_FILE") );
    if( m_last_poll==0 ) {
        loadState();
    }

    int old_polling_interval = m_polling_interval;
    m_polling_interval = param_integer("DEFRAG_INTERVAL", 600);
    if( m_polling_interval <= 0 ) {
        dprintf(D_ALWAYS,
                "DEFRAG_INTERVAL=%d, so no pool defragmentation "
                "will be done.\n", m_polling_interval);
        if( m_polling_timer != -1 ) {
            daemonCore->Cancel_Timer(m_polling_timer);
            m_polling_timer = -1;
        }
    }
    else if( m_polling_timer >= 0 ) {
        if( old_polling_interval != m_polling_interval ) {
            daemonCore->Reset_Timer_Period(m_polling_timer, m_polling_interval);
        }
    }
    else {
        time_t now = time(NULL);
        int first_time = 0;
        if( m_last_poll != 0 && now-m_last_poll < m_polling_interval &&
            m_last_poll <= now ) {
            first_time = m_polling_interval - (now-m_last_poll);
        }
        m_polling_timer = daemonCore->Register_Timer(
            first_time,
            m_polling_interval,
            (TimerHandlercpp)&Defrag::poll,
            "Defrag::poll",
            this );
    }
    if( old_polling_interval != m_polling_interval && m_polling_interval > 0 ) {
        dprintf(D_ALWAYS,
                "Will evaluate defragmentation policy every DEFRAG_INTERVAL="
                "%d seconds.\n", m_polling_interval);
    }

    m_draining_per_hour = param_double("DEFRAG_DRAINING_MACHINES_PER_HOUR", 0, 0);

    double rate = m_draining_per_hour/3600.0*m_polling_interval;
    m_draining_per_poll = (int)floor(rate + 0.00001);
    if( m_draining_per_poll < 0 ) m_draining_per_poll = 0;

    double error_per_hour = (rate - m_draining_per_poll)/m_polling_interval*3600.0;
    m_draining_per_poll_hour = (int)floor(error_per_hour + 0.00001);
    if( m_draining_per_poll_hour < 0 || m_polling_interval > 3600 ) {
        m_draining_per_poll_hour = 0;
    }

    double error_per_day = (error_per_hour - m_draining_per_poll_hour)*24.0;
    m_draining_per_poll_day = (int)floor(error_per_day + 0.5);
    if( m_draining_per_poll_day < 0 || m_polling_interval > 3600*24 ) {
        m_draining_per_poll_day = 0;
    }
    dprintf(D_ALWAYS, "polling interval %ds, DEFRAG_DRAINING_MACHINES_PER_HOUR = %f/hour = %d/interval + %d/hour + %d/day\n",
            m_polling_interval, m_draining_per_hour, m_draining_per_poll,
            m_draining_per_poll_hour, m_draining_per_poll_day);

    m_max_draining = param_integer("DEFRAG_MAX_CONCURRENT_DRAINING", -1, -1);
    m_max_whole_machines = param_integer("DEFRAG_MAX_WHOLE_MACHINES", -1, -1);

    ASSERT( param(m_defrag_requirements, "DEFRAG_REQUIREMENTS") );
    validateExpr( m_defrag_requirements.c_str(), "DEFRAG_REQUIREMENTS" );

    ASSERT( param(m_whole_machine_expr, "DEFRAG_WHOLE_MACHINE_EXPR") );
    validateExpr( m_whole_machine_expr.c_str(), "DEFRAG_WHOLE_MACHINE_EXPR" );

    ASSERT( param(m_draining_schedule_str, "DEFRAG_DRAINING_SCHEDULE") );
    if( m_draining_schedule_str.empty() ) {
        m_draining_schedule = DRAIN_GRACEFUL;
        m_draining_schedule_str = "graceful";
    }
    else {
        m_draining_schedule = getDrainingScheduleNum(m_draining_schedule_str.c_str());
        if( m_draining_schedule < 0 ) {
            EXCEPT("Invalid draining schedule: %s",
                   m_draining_schedule_str.c_str());
        }
    }

    MyString rank;
    param(rank, "DEFRAG_RANK");
    if( rank.IsEmpty() ) {
        m_rank_ad.Delete(ATTR_RANK);
    }
    else {
        if( !m_rank_ad.AssignExpr(ATTR_RANK, rank.Value()) ) {
            EXCEPT("Invalid expression for DEFRAG_RANK: %s",
                   rank.Value());
        }
    }

    int update_interval = param_integer("DEFRAG_UPDATE_INTERVAL", 600);
    if( m_public_ad_update_interval != update_interval ) {
        m_public_ad_update_interval = update_interval;

        dprintf(D_FULLDEBUG, "Setting update interval to %d\n",
                m_public_ad_update_interval);

        if( m_public_ad_update_timer >= 0 ) {
            daemonCore->Reset_Timer_Period(
                m_public_ad_update_timer,
                m_public_ad_update_interval);
        }
        else {
            m_public_ad_update_timer = daemonCore->Register_Timer(
                0,
                m_public_ad_update_interval,
                (TimerHandlercpp)&Defrag::updateCollector,
                "Defrag::updateCollector",
                this);
        }
    }

    if (param(m_cancel_requirements, "DEFRAG_CANCEL_REQUIREMENTS")) {
        validateExpr( m_cancel_requirements.c_str(), "DEFRAG_CANCEL_REQUIREMENTS" );
    } else {
        m_cancel_requirements = "";
    }

    param(m_defrag_name, "DEFRAG_NAME");

    int stats_quantum = m_polling_interval;
    int stats_window = 10*stats_quantum;
    m_stats.SetWindowSize(stats_window, stats_quantum);
}
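// Worked example of the rate decomposition above (hypothetical values):
// with DEFRAG_DRAINING_MACHINES_PER_HOUR = 1.5 and DEFRAG_INTERVAL = 600,
//   rate                     = 1.5/3600 * 600 = 0.25 machines per poll
//   m_draining_per_poll      = floor(0.25) = 0
//   error_per_hour           = (0.25 - 0)/600 * 3600 = 1.5
//   m_draining_per_poll_hour = floor(1.5) = 1
//   error_per_day            = (1.5 - 1) * 24 = 12
//   m_draining_per_poll_day  = floor(12 + 0.5) = 12
// i.e. one machine is drained at the top of each hour plus 12 more spread
// across each day, which recovers the requested 36 machines/day.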
void init_params()
{
    char *tmp;
    static int master_name_in_config = 0;

    if( ! master_name_in_config ) {
        // First time, or we know it's not in the config file.
        if( ! MasterName ) {
            // Not set on command line
            tmp = param( "MASTER_NAME" );
            if( tmp ) {
                MasterName = build_valid_daemon_name( tmp );
                master_name_in_config = 1;
                free( tmp );
            }
        }
    } else {
        delete [] MasterName;
        tmp = param( "MASTER_NAME" );
        MasterName = build_valid_daemon_name( tmp );
        free( tmp );
    }
    if( MasterName ) {
        dprintf( D_FULLDEBUG, "Using name: %s\n", MasterName );
    }

    if (!param_boolean_crufty("START_MASTER", true)) {
        dprintf( D_ALWAYS, "START_MASTER was set to FALSE, shutting down.\n" );
        StartDaemons = FALSE;
        main_shutdown_graceful();
    }

    StartDaemons = TRUE;
    if (!param_boolean_crufty("START_DAEMONS", true)) {
        dprintf( D_ALWAYS,
                 "START_DAEMONS flag was set to FALSE.  Not starting daemons.\n" );
        StartDaemons = FALSE;
    }
    // If we were sent the daemons_off command, don't forget that here.
    if( GotDaemonsOff ) {
        StartDaemons = FALSE;
    }

    PublishObituaries = param_boolean_crufty("PUBLISH_OBITUARIES", true) ? TRUE : FALSE;

    Lines = param_integer("OBITUARY_LOG_LENGTH", 20);

    master_backoff_constant = param_integer( "MASTER_BACKOFF_CONSTANT", 9, 1 );

    master_backoff_ceiling = param_integer( "MASTER_BACKOFF_CEILING", 3600, 1 );

    master_backoff_factor = param_double( "MASTER_BACKOFF_FACTOR", 2.0, 0 );
    if( master_backoff_factor <= 0.0 ) {
        master_backoff_factor = 2.0;
    }

    master_recover_time = param_integer( "MASTER_RECOVER_FACTOR", 300, 1 );

    update_interval = param_integer( "MASTER_UPDATE_INTERVAL", 5 * MINUTE, 1 );

    check_new_exec_interval = param_integer( "MASTER_CHECK_NEW_EXEC_INTERVAL", 5*MINUTE );

    new_bin_delay = param_integer( "MASTER_NEW_BINARY_DELAY", 2*MINUTE, 1 );

    new_bin_restart_mode = GRACEFUL;
    char * restart_mode = param("MASTER_NEW_BINARY_RESTART");
    if (restart_mode) {
#if 1
        StopStateT mode = StringToStopState(restart_mode);
#else
        static const struct {
            const char * text;
            StopStateT mode;
        } modes[] = {
            { "GRACEFUL", GRACEFUL },
            { "PEACEFUL", PEACEFUL },
            { "NEVER", NONE },
            { "NONE", NONE },
            { "NO", NONE },
            // { "FAST", FAST },
            // { "KILL", KILL },
        };
        StopStateT mode = (StopStateT)-1; // prime with -1 so we can detect bad input.
        for (int ii = 0; ii < (int)COUNTOF(modes); ++ii) {
            if (MATCH == strcasecmp(restart_mode, modes[ii].text)) {
                mode = modes[ii].mode;
                break;
            }
        }
#endif
        if (mode == (StopStateT)-1) {
            dprintf(D_ALWAYS,
                    "%s is not a valid value for MASTER_NEW_BINARY_RESTART. using GRACEFUL\n",
                    restart_mode);
        }
        if (mode >= 0 && mode <= NONE)
            new_bin_restart_mode = mode;
        free(restart_mode);
    }

    preen_interval = param_integer( "PREEN_INTERVAL", 24*HOUR, 0 );
    if (preen_interval == 0) {
        EXCEPT("PREEN_INTERVAL in the condor configuration is too low (0). Please set it to an integer in the range 1 to %d (default %d). To disable condor_preen entirely, comment out PREEN.", INT_MAX, 24*HOUR);
    }

    shutdown_fast_timeout = param_integer( "SHUTDOWN_FAST_TIMEOUT", 5*MINUTE, 1 );

    shutdown_graceful_timeout = param_integer( "SHUTDOWN_GRACEFUL_TIMEOUT", 30*MINUTE, 1 );

    AllowAdminCommands = param_boolean( "ALLOW_ADMIN_COMMANDS", true );

    if( FS_Preen ) {
        free( FS_Preen );
    }
    FS_Preen = param( "PREEN" );
}
GlobusResource::ReadFileStatus GlobusResource::ReadMonitorJobStatusFile()
{
    // Return RFS_OK if the file was successfully processed and jobs
    // notified, RFS_PARTIAL if the file looks incomplete, else RFS_ERROR.
    // TODO should distinguish between temporary and permanent problems.
    //   e.g. if file is incomplete, retry after short delay
    FILE *fp;
    char buff[1024];
    char contact[1024];
    int status;
    int scan_start = 0;
    int scan_finish = 0;
    int job_count = 0;

    if ( monitorJobStatusFile == NULL ) {
        EXCEPT("Consistency problem for GlobusResource::ReadMonitorJobStatusFile %s, null job status file name", resourceName);
    }

    fp = safe_fopen_wrapper_follow( monitorJobStatusFile, "r" );
    if ( fp == NULL ) {
        dprintf( D_ALWAYS, "Failed to open grid_monitor job status file %s\n",
                 monitorJobStatusFile );
        return RFS_ERROR;
    }

    if ( fgets( buff, sizeof(buff), fp ) == NULL ) {
        if ( feof(fp) ) {
            dprintf( D_FULLDEBUG, "grid_monitor job status file empty (%s), "
                     "treating as partial.\n", monitorJobStatusFile );
            fclose( fp );
            return RFS_PARTIAL;
        }
        dprintf( D_ALWAYS, "Can't read grid_monitor job status file %s\n",
                 monitorJobStatusFile );
        fclose( fp );
        return RFS_ERROR;
    }
    if ( sscanf( buff, "%d %d", &scan_start, &scan_finish ) != 2 ) {
        dprintf( D_ALWAYS, "Failed to read scan times from grid_monitor "
                 "status file %s\n", monitorJobStatusFile );
        fclose( fp );
        return RFS_ERROR;
    }

    bool found_eof = false;
    while ( fgets( buff, sizeof(buff), fp ) != NULL ) {
        contact[0] = '\0';
        status = 0;

        const char * MAGIC_EOF = "GRIDMONEOF";
        if ( strncmp(buff, MAGIC_EOF, strlen(MAGIC_EOF)) == 0 ) {
            found_eof = true;
            break;
        }

        if ( sscanf( buff, "%s %d", contact, &status ) == 2 &&
             *contact != '\0' && status > 0 ) {
            int rc;
            GlobusJob *job = NULL;
            job_count++;
            rc = JobsByContact.lookup( HashKey( globusJobId(contact) ), job );
            if ( rc == 0 && job != NULL ) {
                if ( status == GLOBUS_GRAM_PROTOCOL_JOB_STATE_DONE ) {
                    status = GLOBUS_GRAM_PROTOCOL_JOB_STATE_STAGE_OUT;
                }
                // Don't flood the log file with a long stream of
                // identical job status updates.  We do need to send
                // identical job status updates to the job so that it
                // can track the last time we received an update on
                // its status.
                if ( status != job->globusState ) {
                    dprintf(D_FULLDEBUG,
                            "Sending callback of %d to %d.%d (%s)\n",
                            status, job->procID.cluster, job->procID.proc,
                            resourceName);
                }
                job->GramCallback( status, 0 );
            }
        }
    }

    fclose( fp );

    int limit = param_integer( "GRID_MONITOR_NO_STATUS_TIMEOUT", 15*60 );
    time_t now = time(NULL);
    GlobusJob *next_job;
    registeredJobs.Rewind();
    while ( (next_job = (GlobusJob *)registeredJobs.Next()) != NULL ) {
        if ( next_job->jobContact &&
             now > next_job->lastRemoteStatusUpdate + limit ) {
            next_job->SetEvaluateState();
        }
    }

    dprintf( D_FULLDEBUG, "Read %s grid_monitor status file for %s: "
             "scan start=%d, scan finish=%d, job count=%d\n",
             found_eof ? "full" : "partial", resourceName,
             scan_start, scan_finish, job_count );

    if ( found_eof )
        return RFS_OK;
    return RFS_PARTIAL;
}
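// The job status file parsed above has this shape (reconstructed from the
// parsing logic; contact strings and states are illustrative):
//
//   1589000000 1589000060
//   https://gatekeeper.example.edu:40001/16001/1589000000/ 2
//   https://gatekeeper.example.edu:40002/16002/1589000001/ 4
//   GRIDMONEOF
//
// Line 1 holds the scan start/finish times; each following line is a GRAM
// job contact and its integer status; the GRIDMONEOF sentinel marks a
// complete file, without which the read is treated as RFS_PARTIAL.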
void GlobusResource::CheckMonitor()
{
    BaseJob *base_job = NULL;
    GlobusJob *job;
    // TODO should we require our jobs to request the grid monitor before
    //   we'll start it up?
    // TODO what if we're in the middle of a ping when we get here? Either
    //   delay until ping is done or have separate GahpClient
    // TODO if resource is down, should we delay any actions?
    daemonCore->Reset_Timer( checkMonitorTid, TIMER_NEVER );

    dprintf(D_FULLDEBUG, "grid_monitor for %s entering CheckMonitor\n",
            resourceName);

    if ( m_versionKnown && m_isGt5 ) {
        dprintf( D_FULLDEBUG, "Disabling grid_monitor for GRAM5 server %s\n",
                 resourceName );
        return;
    }

    if ( monitorGahp->isInitialized() == false ) {
        dprintf( D_ALWAYS, "GAHP server not initialized yet, not submitting "
                 "grid_monitor now\n" );
        daemonCore->Reset_Timer( checkMonitorTid, 5 );
        return;
    }

    if ( !enableGridMonitor ) {
        return;
    }

    if ( time(NULL) < monitorRetryTime ) {
        daemonCore->Reset_Timer( checkMonitorTid,
                                 monitorRetryTime - time(NULL) );
        return;
    }

    if ( firstPingDone == false ) {
        dprintf(D_FULLDEBUG, "grid_monitor for %s: first ping not done yet, "
                "will retry later\n", resourceName);
        daemonCore->Reset_Timer( checkMonitorTid, 5 );
        return;
    }

    if ( monitorSubmitActive ) {
        int rc;
        std::string job_contact;
        monitorGahp->setMode( GahpClient::results_only );
        rc = monitorGahp->globus_gram_client_job_request( NULL, NULL, 0, NULL,
                                                          job_contact, false );
        if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
             rc == GAHPCLIENT_COMMAND_PENDING ) {
            // do nothing
        } else if ( rc == 0 ) {
            // successful submit
            monitorGramJobId = strdup( job_contact.c_str() );
            monitorSubmitActive = false;
        } else {
            // submit failed
            dprintf(D_ALWAYS,
                    "grid_monitor job submit failed for resource %s, gram error %d (%s)\n",
                    resourceName, rc,
                    monitorGahp->globus_gram_client_error_string(rc));
            monitorSubmitActive = false;
            AbandonMonitor();
            return;
        }
    }

    if ( monitorActive == false && monitorStarting == false ) {
        monitorStarting = true;
        if ( SubmitMonitorJob() == true ) {
            daemonCore->Reset_Timer( checkMonitorTid, 30 );
        } else {
            dprintf(D_ALWAYS, "Unable to start grid_monitor for resource %s\n",
                    resourceName);
            // TODO: Do nice retry?
            AbandonMonitor();
        }
    } else {
        int rc;
        struct stat file_status;
        int job_status_mod_time;
        int log_mod_time;

        if ( monitorJobStatusFile == NULL ) {
            EXCEPT("Consistency problem for GlobusResource %s, null job status file name", resourceName);
        }
        if ( monitorLogFile == NULL ) {
            EXCEPT("Consistency problem for GlobusResource %s, null monitor log file name", resourceName);
        }

        rc = stat( monitorJobStatusFile, &file_status );
        if ( rc < 0 ) {
            EXCEPT( "stat(%s) failed, errno=%d", monitorJobStatusFile, errno );
        }
        job_status_mod_time = file_status.st_mtime;

        rc = stat( monitorLogFile, &file_status );
        if ( rc < 0 ) {
            EXCEPT( "stat(%s) failed, errno=%d", monitorLogFile, errno );
        }
        log_mod_time = file_status.st_mtime;

        if ( job_status_mod_time > jobStatusFileLastReadTime ) {
            dprintf(D_FULLDEBUG,
                    "grid_monitor job status file for %s has been refreshed.\n",
                    resourceName);
            ReadFileStatus status = ReadMonitorJobStatusFile();
            if ( status == RFS_OK ) {
                dprintf(D_FULLDEBUG,
                        "Read grid_monitor status file for %s successfully\n",
                        resourceName);
                jobStatusFileLastReadTime = time(NULL);
                jobStatusFileLastUpdate = time(NULL);
                daemonCore->Reset_Timer( checkMonitorTid, 30 );
            } else if ( status == RFS_PARTIAL ) {
                const int RETRY_TIME = 10;
                dprintf(D_FULLDEBUG, "*** status file is partial, "
                        "will try again in %d seconds\n", RETRY_TIME);
                daemonCore->Reset_Timer( checkMonitorTid, RETRY_TIME );
            } else if ( status == RFS_ERROR ) {
                dprintf(D_ALWAYS, "grid_monitor: error reading job status "
                        "file for %s, stopping grid monitor\n", resourceName);
                // TODO: Try to restart monitor?
                AbandonMonitor();
                return;
            } else {
                EXCEPT("ReadMonitorJobStatusFile returned unexpected %d "
                       "(for %s)", (int)status, resourceName);
            }
        }

        int log_file_timeout = param_integer("GRID_MONITOR_HEARTBEAT_TIMEOUT",
                                             300);
        int monitor_retry_duration = param_integer("GRID_MONITOR_RETRY_DURATION",
                                                   900);

        if ( log_mod_time > logFileLastReadTime ) {
            dprintf(D_FULLDEBUG, "grid_monitor log file for %s updated.\n",
                    resourceName);
            rc = ReadMonitorLogFile();

            switch( rc ) {
            case 0: // Normal / OK
                dprintf(D_FULLDEBUG,
                        "grid_monitor log file for %s looks normal\n",
                        resourceName);
                if ( monitorStarting ) {
                    dprintf(D_ALWAYS, "Successfully started grid_monitor "
                            "for %s\n", resourceName);
                    monitorStarting = false;
                    monitorFirstStartup = false;
                    monitorActive = true;
                    registeredJobs.Rewind();
                    while ( registeredJobs.Next( base_job ) ) {
                        job = dynamic_cast<GlobusJob*>( base_job );
                        job->SetEvaluateState();
                    }
                }
                logFileLastReadTime = time(NULL);
                daemonCore->Reset_Timer( checkMonitorTid, 30 );
                break;

            case 1: // Exited normally (should restart)
                dprintf(D_FULLDEBUG,
                        "grid_monitor for %s reached maximum lifetime, "
                        "restarting...\n", resourceName);
                if ( SubmitMonitorJob() == true ) {
                    dprintf(D_FULLDEBUG, "grid_monitor for %s restarted.\n",
                            resourceName);
                    daemonCore->Reset_Timer( checkMonitorTid, 30 );
                } else {
                    dprintf(D_ALWAYS,
                            "Unable to restart grid_monitor for resource %s\n",
                            resourceName);
                    // TODO: Try to restart monitor?
                    AbandonMonitor();
                }
                break;

            case 2: // Exited with error
                dprintf(D_ALWAYS, "Error with grid_monitor for %s, stopping.\n",
                        resourceName);
                // TODO: Try to restart monitor?
                AbandonMonitor();
                break;

            default:
                EXCEPT( "Unknown return value %d from ReadLogFile", rc );
            }
        }
        else if ( time(NULL) > logFileLastReadTime + log_file_timeout &&
                  !monitorStarting ) {
            dprintf( D_ALWAYS, "Haven't heard from running grid_monitor "
                     "at %s for %d seconds, trying new job submission\n",
                     resourceName, log_file_timeout );
            if ( ! SubmitMonitorJob() ) {
                dprintf(D_ALWAYS, "Failed to restart grid_monitor.  "
                        "Giving up on grid_monitor for site %s\n",
                        resourceName);
                AbandonMonitor();
            }
            daemonCore->Reset_Timer( checkMonitorTid, 30 );
        }
        else if ( monitorStarting &&
                  time(NULL) > logFileLastReadTime + monitor_retry_duration ) {
            dprintf( D_ALWAYS, "Haven't heard from new grid_monitor "
                     "at %s for %d seconds, giving up\n",
                     resourceName, monitor_retry_duration );
            AbandonMonitor();
        }
        else {
            daemonCore->Reset_Timer( checkMonitorTid, 30 );
        }
    }
    return;
}
void BaseResource::Reconfig()
{
    int tmp_int;
    char *param_value;
    std::string param_name;

    tmp_int = param_integer( "GRIDMANAGER_RESOURCE_PROBE_INTERVAL", 5 * 60 );
    setProbeInterval( tmp_int );

    jobLimit = -1;
    formatstr( param_name, "GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE_%s",
               ResourceType() );
    param_value = param( param_name.c_str() );
    if ( param_value == NULL ) {
        param_value = param( "GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE" );
    }
    if ( param_value != NULL ) {
        char *tmp1;
        char *tmp2;
        StringList limits( param_value );
        limits.rewind();
        if ( limits.number() > 0 ) {
            jobLimit = atoi( limits.next() );
            while ( (tmp1 = limits.next()) && (tmp2 = limits.next()) ) {
                if ( strstr( resourceName, tmp1 ) != 0 ) {
                    jobLimit = atoi( tmp2 );
                }
            }
        }
        free( param_value );
    }
    if ( jobLimit <= 0 ) {
        jobLimit = DEFAULT_MAX_SUBMITTED_JOBS_PER_RESOURCE;
    }

    // If the jobLimit was widened, move jobs from Wanted to Allowed and
    // signal them
    while ( submitsAllowed.Length() < jobLimit &&
            submitsWanted.Length() > 0 ) {
        BaseJob *wanted_job = submitsWanted.Head();
        submitsWanted.Delete( wanted_job );
        submitsAllowed.Append( wanted_job );
        wanted_job->SetEvaluateState();
    }

    formatstr( param_name, "GRIDMANAGER_JOB_PROBE_RATE_%s", ResourceType() );
    m_paramJobPollRate = param_integer( param_name.c_str(), -1 );
    if ( m_paramJobPollRate <= 0 ) {
        m_paramJobPollRate = param_integer( "GRIDMANAGER_JOB_PROBE_RATE",
                                            DEFAULT_JOB_POLL_RATE );
    }
    if ( m_paramJobPollRate <= 0 ) {
        m_paramJobPollRate = DEFAULT_JOB_POLL_RATE;
    }

    const char *legacy_job_poll_param = NULL;
    const char *type = ResourceType();
    if ( strcmp( type, "condor" ) == 0 ) {
        legacy_job_poll_param = "CONDOR_JOB_POLL_INTERVAL";
    } else if ( strcmp( type, "batch" ) == 0 ||
                strcmp( type, "pbs" ) == 0 ||
                strcmp( type, "lsf" ) == 0 ||
                strcmp( type, "nqs" ) == 0 ||
                strcmp( type, "sge" ) == 0 ||
                strcmp( type, "naregi" ) == 0 ) {
        legacy_job_poll_param = "INFN_JOB_POLL_INTERVAL";
    }

    formatstr( param_name, "GRIDMANAGER_JOB_PROBE_INTERVAL_%s", ResourceType() );
    m_paramJobPollInterval = param_integer( param_name.c_str(), -1 );
    if ( m_paramJobPollInterval <= 0 ) {
        m_paramJobPollInterval = param_integer( "GRIDMANAGER_JOB_PROBE_INTERVAL", -1 );
    }
    if ( m_paramJobPollInterval <= 0 && legacy_job_poll_param ) {
        m_paramJobPollInterval = param_integer( legacy_job_poll_param, -1 );
    }
    if ( m_paramJobPollInterval <= 0 ) {
        m_paramJobPollInterval = DEFAULT_JOB_POLL_INTERVAL;
    }

    SetJobPollInterval();

    _collectorUpdateInterval = param_integer(
        "GRIDMANAGER_COLLECTOR_UPDATE_INTERVAL", 5*60 );
}
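// The job-limit list parsed above is a default followed by optional
// (substring, limit) pairs matched against resourceName.  For example
// (hypothetical values):
//
//   GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE = 100, cluster1.example.com, 20
//
// gives every resource a limit of 100, except those whose name contains
// "cluster1.example.com", which get 20.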