// Cancel draining on machines that match the configured cancel requirements.
//
// Does nothing when m_cancel_requirements is empty.  Otherwise it counts the
// machines matching "(m_cancel_requirements) && (DRAINING_CONSTRAINT)", then
// walks every draining slot ad (shuffled, then sorted with StartdSortFunc
// against m_rank_ad) and calls cancel_drain() once per matching machine.
//
// cancelled_machines: in/out set of machine names.  Machines already in the
//   set are skipped; every machine cancelled here is added to it.
void Defrag::poll_cancel(MachineSet &cancelled_machines)
{
    if (m_cancel_requirements.size() == 0) {
        return;
    }

    MachineSet draining_whole_machines;
    std::stringstream draining_whole_machines_ss;
    draining_whole_machines_ss << "(" << m_cancel_requirements << ") && (" << DRAINING_CONSTRAINT << ")";
    int num_draining_whole_machines = countMachines(draining_whole_machines_ss.str().c_str(),
        "<DEFRAG_CANCEL_REQUIREMENTS>", &draining_whole_machines);

    // Defrag::poll() treats a negative count from countMachines() as a failed
    // query; do the same here instead of reporting "-1 machines" and going on
    // to issue a second, pointless query.
    if (num_draining_whole_machines < 0) {
        dprintf(D_ALWAYS, "Not cancelling any drains, because the query to find draining whole machines failed.\n");
        return;
    }
    if (num_draining_whole_machines == 0) {
        // Early exit: nothing to do.
        return;
    }
    dprintf(D_ALWAYS, "Of the whole machines, %d are in the draining state.\n", num_draining_whole_machines);

    ClassAdList startdAds;
    if (!queryMachines(DRAINING_CONSTRAINT, "DRAINING_CONSTRAINT <all draining slots>",startdAds)) {
        return;
    }

    startdAds.Shuffle();
    startdAds.Sort(StartdSortFunc,&m_rank_ad);

    startdAds.Open();

    unsigned int cancel_count = 0;
    ClassAd *startd_ad_ptr;
    while ( (startd_ad_ptr=startdAds.Next()) ) {
        ClassAd &startd_ad = *startd_ad_ptr;
        std::string machine;
        std::string name;
        startd_ad.LookupString(ATTR_NAME,name);
        slotNameToDaemonName(name,machine);

        // Several slot ads can map to the same machine; cancel each machine
        // at most once per cycle.
        if( !cancelled_machines.count(machine) && draining_whole_machines.count(machine) ) {
            cancel_drain(startd_ad);
            cancelled_machines.insert(machine);
            cancel_count ++;
        }
    }

    startdAds.Close();

    dprintf(D_ALWAYS, "Cancelled draining of %u whole machines.\n", cancel_count);
}
void Defrag::poll() { dprintf(D_FULLDEBUG,"Evaluating defragmentation policy.\n"); // If we crash during this polling cycle, we will have saved // the time of the last poll, so the next cycle will be // scheduled on the false assumption that a cycle ran now. In // this way, we error on the side of draining too little // rather than too much. time_t now = time(NULL); time_t prev = m_last_poll; m_last_poll = now; saveState(); m_stats.Tick(); int num_to_drain = m_draining_per_poll; time_t last_hour = (prev / 3600)*3600; time_t current_hour = (now / 3600)*3600; time_t last_day = (prev / (3600*24))*3600*24; time_t current_day = (now / (3600*24))*3600*24; if( current_hour != last_hour ) { num_to_drain += prorate(m_draining_per_poll_hour,now-current_hour,3600,m_polling_interval); } if( current_day != last_day ) { num_to_drain += prorate(m_draining_per_poll_day,now-current_day,3600*24,m_polling_interval); } int num_draining = countMachines(DRAINING_CONSTRAINT,"<InternalDrainingConstraint>"); m_stats.MachinesDraining = num_draining; MachineSet whole_machines; int num_whole_machines = countMachines(m_whole_machine_expr.c_str(),"DEFRAG_WHOLE_MACHINE_EXPR",&whole_machines); m_stats.WholeMachines = num_whole_machines; dprintf(D_ALWAYS,"There are currently %d draining and %d whole machines.\n", num_draining,num_whole_machines); queryDrainingCost(); // If possible, cancel some drains. 
MachineSet cancelled_machines; poll_cancel(cancelled_machines); if( num_to_drain <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because number to drain in next %ds is calculated to be 0.\n", m_polling_interval); return; } if( (int)ceil(m_draining_per_hour) <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_DRAINING_MACHINES_PER_HOUR=%f\n", m_draining_per_hour); return; } if( m_max_draining == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=0\n"); return; } if( m_max_whole_machines == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=0\n"); return; } if( m_max_draining >= 0 ) { if( num_draining >= m_max_draining ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and there are %d draining machines.\n", m_max_draining, num_draining); return; } else if( num_draining < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and the query to count draining machines failed.\n", m_max_draining); return; } } if( m_max_whole_machines >= 0 ) { if( num_whole_machines >= m_max_whole_machines ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=%d and there are %d whole machines.\n", m_max_whole_machines, num_whole_machines); return; } } // Even if m_max_whole_machines is -1 (infinite), we still need // the list of whole machines in order to filter them out in // the draining selection algorithm, so abort now if the // whole machine query failed. 
if( num_whole_machines < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because the query to find whole machines failed.\n"); return; } dprintf(D_ALWAYS,"Looking for %d machines to drain.\n",num_to_drain); ClassAdList startdAds; std::string requirements; sprintf(requirements,"(%s) && Draining =!= true",m_defrag_requirements.c_str()); if( !queryMachines(requirements.c_str(),"DEFRAG_REQUIREMENTS",startdAds) ) { dprintf(D_ALWAYS,"Doing nothing, because the query to select machines matching DEFRAG_REQUIREMENTS failed.\n"); return; } startdAds.Shuffle(); startdAds.Sort(StartdSortFunc,&m_rank_ad); startdAds.Open(); int num_drained = 0; ClassAd *startd_ad_ptr; MachineSet machines_done; while( (startd_ad_ptr=startdAds.Next()) ) { if (!startd_ad_ptr) continue; ClassAd &startd_ad = *startd_ad_ptr; std::string machine; std::string name; startd_ad.LookupString(ATTR_NAME,name); slotNameToDaemonName(name,machine); // If we have already cancelled draining on this machine, ignore it for this cycle. if( cancelled_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already cancelled draining of %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( machines_done.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already attempted to drain %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( whole_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: because it is already running as a whole machine.\n", name.c_str()); continue; } if( drain(startd_ad) ) { machines_done.insert(machine); if( ++num_drained >= num_to_drain ) { dprintf(D_ALWAYS, "Drained maximum number of machines allowed in this cycle (%d).\n", num_to_drain); break; } } } startdAds.Close(); dprintf(D_ALWAYS,"Drained %d machines (wanted to drain %d machines).\n", num_drained,num_drained); dprintf(D_FULLDEBUG,"Done evaluating defragmentation policy.\n"); }
void Rooster::poll() { dprintf(D_FULLDEBUG,"C**k-a-doodle-doo! (Time to look for machines to wake up.)\n"); ClassAdList startdAds; CondorQuery unhibernateQuery(STARTD_AD); ExprTree *requirements = NULL; if( ParseClassAdRvalExpr( m_unhibernate_constraint.Value(), requirements )!=0 || requirements==NULL ) { EXCEPT("Invalid expression for ROOSTER_UNHIBERNATE: %s\n", m_unhibernate_constraint.Value()); } unhibernateQuery.addANDConstraint(m_unhibernate_constraint.Value()); CollectorList* collects = daemonCore->getCollectorList(); ASSERT( collects ); QueryResult result; result = collects->query(unhibernateQuery,startdAds); if( result != Q_OK ) { dprintf(D_ALWAYS, "Couldn't fetch startd ads using constraint " "ROOSTER_UNHIBERNATE=%s: %s\n", m_unhibernate_constraint.Value(), getStrQueryResult(result)); return; } dprintf(D_FULLDEBUG,"Got %d startd ads matching ROOSTER_UNHIBERNATE=%s\n", startdAds.MyLength(), m_unhibernate_constraint.Value()); startdAds.Sort(StartdSortFunc,&m_rank_ad); startdAds.Open(); int num_woken = 0; ClassAd *startd_ad; HashTable<MyString,bool> machines_done(MyStringHash); while( (startd_ad=startdAds.Next()) ) { MyString machine; MyString name; startd_ad->LookupString(ATTR_MACHINE,machine); startd_ad->LookupString(ATTR_NAME,name); if( machines_done.exists(machine)==0 ) { dprintf(D_FULLDEBUG, "Skipping %s: already attempted to wake up %s in this cycle.\n", name.Value(),machine.Value()); continue; } // in case the unhibernate expression is time-sensitive, // re-evaluate it now to make sure it still passes if( !EvalBool(startd_ad,requirements) ) { dprintf(D_ALWAYS, "Skipping %s: ROOSTER_UNHIBERNATE is no longer true.\n", name.Value()); continue; } if( wakeUp(startd_ad) ) { machines_done.insert(machine,true); if( ++num_woken >= m_max_unhibernate && m_max_unhibernate > 0 ) { dprintf(D_ALWAYS, "Reached ROOSTER_MAX_UNHIBERNATE=%d in this cycle.\n", m_max_unhibernate); break; } } } startdAds.Close(); delete requirements; requirements = NULL; if( 
startdAds.MyLength() ) { dprintf(D_FULLDEBUG,"Done sending wakeup calls.\n"); } }
int main (int argc, char *argv[]) { #if !defined(WIN32) install_sig_handler(SIGPIPE, (SIG_HANDLER)SIG_IGN ); #endif // initialize to read from config file myDistro->Init( argc, argv ); myName = argv[0]; config(); dprintf_config_tool_on_error(0); // The arguments take two passes to process --- the first pass // figures out the mode, after which we can instantiate the required // query object. We add implied constraints from the command line in // the second pass. firstPass (argc, argv); // if the mode has not been set, it is STARTD_NORMAL if (mode == MODE_NOTSET) { setMode (MODE_STARTD_NORMAL, 0, DEFAULT); } // instantiate query object if (!(query = new CondorQuery (type))) { dprintf_WriteOnErrorBuffer(stderr, true); fprintf (stderr, "Error: Out of memory\n"); exit (1); } // if a first-pass setMode set a mode_constraint, apply it now to the query object if (mode_constraint && ! explicit_format) { query->addANDConstraint(mode_constraint); } // set pretty print style implied by the type of entity being queried // but do it with default priority, so that explicitly requested options // can override it switch (type) { #ifdef HAVE_EXT_POSTGRESQL case QUILL_AD: setPPstyle(PP_QUILL_NORMAL, 0, DEFAULT); break; #endif /* HAVE_EXT_POSTGRESQL */ case DEFRAG_AD: setPPstyle(PP_GENERIC_NORMAL, 0, DEFAULT); break; case STARTD_AD: setPPstyle(PP_STARTD_NORMAL, 0, DEFAULT); break; case SCHEDD_AD: setPPstyle(PP_SCHEDD_NORMAL, 0, DEFAULT); break; case MASTER_AD: setPPstyle(PP_MASTER_NORMAL, 0, DEFAULT); break; case CKPT_SRVR_AD: setPPstyle(PP_CKPT_SRVR_NORMAL, 0, DEFAULT); break; case COLLECTOR_AD: setPPstyle(PP_COLLECTOR_NORMAL, 0, DEFAULT); break; case STORAGE_AD: setPPstyle(PP_STORAGE_NORMAL, 0, DEFAULT); break; case NEGOTIATOR_AD: setPPstyle(PP_NEGOTIATOR_NORMAL, 0, DEFAULT); break; case GRID_AD: setPPstyle(PP_GRID_NORMAL, 0, DEFAULT); break; case GENERIC_AD: setPPstyle(PP_GENERIC, 0, DEFAULT); break; case ANY_AD: setPPstyle(PP_ANY_NORMAL, 0, DEFAULT); break; default: 
setPPstyle(PP_VERBOSE, 0, DEFAULT); } // set the constraints implied by the mode switch (mode) { #ifdef HAVE_EXT_POSTGRESQL case MODE_QUILL_NORMAL: #endif /* HAVE_EXT_POSTGRESQL */ case MODE_DEFRAG_NORMAL: case MODE_STARTD_NORMAL: case MODE_MASTER_NORMAL: case MODE_CKPT_SRVR_NORMAL: case MODE_SCHEDD_NORMAL: case MODE_SCHEDD_SUBMITTORS: case MODE_COLLECTOR_NORMAL: case MODE_NEGOTIATOR_NORMAL: case MODE_STORAGE_NORMAL: case MODE_GENERIC_NORMAL: case MODE_ANY_NORMAL: case MODE_GRID_NORMAL: case MODE_HAD_NORMAL: break; case MODE_OTHER: // tell the query object what the type we're querying is query->setGenericQueryType(genericType); free(genericType); genericType = NULL; break; case MODE_STARTD_AVAIL: // For now, -avail shows you machines avail to anyone. sprintf (buffer, "%s == \"%s\"", ATTR_STATE, state_to_string(unclaimed_state)); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addORConstraint (buffer); break; case MODE_STARTD_RUN: sprintf (buffer, "%s == \"%s\"", ATTR_STATE, state_to_string(claimed_state)); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addORConstraint (buffer); break; case MODE_STARTD_COD: sprintf (buffer, "%s > 0", ATTR_NUM_COD_CLAIMS ); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addORConstraint (buffer); break; default: break; } if(javaMode) { sprintf( buffer, "%s == TRUE", ATTR_HAS_JAVA ); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addANDConstraint (buffer); projList.AppendArg(ATTR_HAS_JAVA); projList.AppendArg(ATTR_JAVA_MFLOPS); projList.AppendArg(ATTR_JAVA_VENDOR); projList.AppendArg(ATTR_JAVA_VERSION); } if(offlineMode) { query->addANDConstraint( "size( OfflineUniverses ) != 0" ); projList.AppendArg( "OfflineUniverses" ); // // Since we can't add a regex to a projection, explicitly list all // the attributes we know about. 
// projList.AppendArg( "HasVM" ); projList.AppendArg( "VMOfflineReason" ); projList.AppendArg( "VMOfflineTime" ); } if(absentMode) { sprintf( buffer, "%s == TRUE", ATTR_ABSENT ); if (diagnose) { printf( "Adding constraint %s\n", buffer ); } query->addANDConstraint( buffer ); projList.AppendArg( ATTR_ABSENT ); projList.AppendArg( ATTR_LAST_HEARD_FROM ); projList.AppendArg( ATTR_CLASSAD_LIFETIME ); } if(vmMode) { sprintf( buffer, "%s == TRUE", ATTR_HAS_VM); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addANDConstraint (buffer); projList.AppendArg(ATTR_VM_TYPE); projList.AppendArg(ATTR_VM_MEMORY); projList.AppendArg(ATTR_VM_NETWORKING); projList.AppendArg(ATTR_VM_NETWORKING_TYPES); projList.AppendArg(ATTR_VM_HARDWARE_VT); projList.AppendArg(ATTR_VM_AVAIL_NUM); projList.AppendArg(ATTR_VM_ALL_GUEST_MACS); projList.AppendArg(ATTR_VM_ALL_GUEST_IPS); projList.AppendArg(ATTR_VM_GUEST_MAC); projList.AppendArg(ATTR_VM_GUEST_IP); } // second pass: add regular parameters and constraints if (diagnose) { printf ("----------\n"); } secondPass (argc, argv); // initialize the totals object if (ppStyle == PP_CUSTOM && using_print_format) { if (pmHeadFoot & HF_NOSUMMARY) ppTotalStyle = PP_CUSTOM; } else { ppTotalStyle = ppStyle; } TrackTotals totals(ppTotalStyle); // fetch the query QueryResult q; if ((mode == MODE_STARTD_NORMAL) && (ppStyle == PP_STARTD_NORMAL)) { projList.AppendArg("Name"); projList.AppendArg("Machine"); projList.AppendArg("Opsys"); projList.AppendArg("Arch"); projList.AppendArg("State"); projList.AppendArg("Activity"); projList.AppendArg("LoadAvg"); projList.AppendArg("Memory"); projList.AppendArg("ActvtyTime"); projList.AppendArg("MyCurrentTime"); projList.AppendArg("EnteredCurrentActivity"); } else if( ppStyle == PP_VERBOSE ) { // Remove everything from the projection list if we're displaying // the "long form" of the ads. projList.Clear(); // but if -attributes was supplied, show only those attributes if ( ! 
dashAttributes.isEmpty()) { const char * s; dashAttributes.rewind(); while ((s = dashAttributes.next())) { projList.AppendArg(s); } } } if( projList.Count() > 0 ) { char **attr_list = projList.GetStringArray(); query->setDesiredAttrs(attr_list); deleteStringArray(attr_list); } // if diagnose was requested, just print the query ad if (diagnose) { ClassAd queryAd; // print diagnostic information about inferred internal state setMode ((Mode) 0, 0, NULL); setType (NULL, 0, NULL); setPPstyle ((ppOption) 0, 0, DEFAULT); printf ("----------\n"); q = query->getQueryAd (queryAd); fPrintAd (stdout, queryAd); printf ("----------\n"); fprintf (stderr, "Result of making query ad was: %d\n", q); exit (1); } // Address (host:port) is taken from requested pool, if given. char* addr = (NULL != pool) ? pool->addr() : NULL; Daemon* requested_daemon = pool; // If we're in "direct" mode, then we attempt to locate the daemon // associated with the requested subsystem (here encoded by value of mode) // In this case the host:port of pool (if given) denotes which // pool is being consulted if( direct ) { Daemon *d = NULL; switch( mode ) { case MODE_MASTER_NORMAL: d = new Daemon( DT_MASTER, direct, addr ); break; case MODE_STARTD_NORMAL: case MODE_STARTD_AVAIL: case MODE_STARTD_RUN: case MODE_STARTD_COD: d = new Daemon( DT_STARTD, direct, addr ); break; #ifdef HAVE_EXT_POSTGRESQL case MODE_QUILL_NORMAL: d = new Daemon( DT_QUILL, direct, addr ); break; #endif /* HAVE_EXT_POSTGRESQL */ case MODE_SCHEDD_NORMAL: case MODE_SCHEDD_SUBMITTORS: d = new Daemon( DT_SCHEDD, direct, addr ); break; case MODE_NEGOTIATOR_NORMAL: d = new Daemon( DT_NEGOTIATOR, direct, addr ); break; case MODE_CKPT_SRVR_NORMAL: case MODE_COLLECTOR_NORMAL: case MODE_LICENSE_NORMAL: case MODE_STORAGE_NORMAL: case MODE_GENERIC_NORMAL: case MODE_ANY_NORMAL: case MODE_OTHER: case MODE_GRID_NORMAL: case MODE_HAD_NORMAL: // These have to go to the collector, anyway. 
break; default: fprintf( stderr, "Error: Illegal mode %d\n", mode ); exit( 1 ); break; } // Here is where we actually override 'addr', if we can obtain // address of the requested daemon/subsys. If it can't be // located, then fail with error msg. // 'd' will be null (unset) if mode is one of above that must go to // collector (MODE_ANY_NORMAL, MODE_COLLECTOR_NORMAL, etc) if (NULL != d) { if( d->locate() ) { addr = d->addr(); requested_daemon = d; } else { const char* id = d->idStr(); if (NULL == id) id = d->name(); dprintf_WriteOnErrorBuffer(stderr, true); if (NULL == id) id = "daemon"; fprintf(stderr, "Error: Failed to locate %s\n", id); fprintf(stderr, "%s\n", d->error()); exit( 1 ); } } } ClassAdList result; CondorError errstack; if (NULL != ads_file) { MyString req; // query requirements q = query->getRequirements(req); const char * constraint = req.empty() ? NULL : req.c_str(); if (read_classad_file(ads_file, result, constraint)) { q = Q_OK; } } else if (NULL != addr) { // this case executes if pool was provided, or if in "direct" mode with // subsystem that corresponds to a daemon (above). // Here 'addr' represents either the host:port of requested pool, or // alternatively the host:port of daemon associated with requested subsystem (direct mode) q = query->fetchAds (result, addr, &errstack); } else { // otherwise obtain list of collectors and submit query that way CollectorList * collectors = CollectorList::create(); q = collectors->query (*query, result, &errstack); delete collectors; } // if any error was encountered during the query, report it and exit if (Q_OK != q) { dprintf_WriteOnErrorBuffer(stderr, true); // we can always provide these messages: fprintf( stderr, "Error: %s\n", getStrQueryResult(q) ); fprintf( stderr, "%s\n", errstack.getFullText(true).c_str() ); if ((NULL != requested_daemon) && ((Q_NO_COLLECTOR_HOST == q) || (requested_daemon->type() == DT_COLLECTOR))) { // Specific long message if connection to collector failed. 
const char* fullhost = requested_daemon->fullHostname(); if (NULL == fullhost) fullhost = "<unknown_host>"; const char* daddr = requested_daemon->addr(); if (NULL == daddr) daddr = "<unknown>"; char info[1000]; sprintf(info, "%s (%s)", fullhost, daddr); printNoCollectorContact( stderr, info, !expert ); } else if ((NULL != requested_daemon) && (Q_COMMUNICATION_ERROR == q)) { // more helpful message for failure to connect to some daemon/subsys const char* id = requested_daemon->idStr(); if (NULL == id) id = requested_daemon->name(); if (NULL == id) id = "daemon"; const char* daddr = requested_daemon->addr(); if (NULL == daddr) daddr = "<unknown>"; fprintf(stderr, "Error: Failed to contact %s at %s\n", id, daddr); } // fail exit (1); } if (noSort) { // do nothing } else if (sortSpecs.empty()) { // default classad sorting result.Sort((SortFunctionType)lessThanFunc); } else { // User requested custom sorting expressions: // insert attributes related to custom sorting result.Open(); while (ClassAd* ad = result.Next()) { for (vector<SortSpec>::iterator ss(sortSpecs.begin()); ss != sortSpecs.end(); ++ss) { ss->expr->SetParentScope(ad); classad::Value v; ss->expr->Evaluate(v); stringstream vs; // This will properly render all supported value types, // including undefined and error, although current semantic // pre-filters classads where sort expressions are undef/err: vs << ((v.IsStringValue())?"\"":"") << v << ((v.IsStringValue())?"\"":""); ad->AssignExpr(ss->keyAttr.c_str(), vs.str().c_str()); // Save the full expr in case user wants to examine on output: ad->AssignExpr(ss->keyExprAttr.c_str(), ss->arg.c_str()); } } result.Open(); result.Sort((SortFunctionType)customLessThanFunc); } // output result prettyPrint (result, &totals); delete query; return 0; }
void Defrag::poll() { dprintf(D_FULLDEBUG,"Evaluating defragmentation policy.\n"); // If we crash during this polling cycle, we will have saved // the time of the last poll, so the next cycle will be // scheduled on the false assumption that a cycle ran now. In // this way, we error on the side of draining too little // rather than too much. time_t now = time(NULL); time_t prev = m_last_poll; m_last_poll = now; saveState(); m_stats.Tick(); int num_to_drain = m_draining_per_poll; time_t last_hour = (prev / 3600)*3600; time_t current_hour = (now / 3600)*3600; time_t last_day = (prev / (3600*24))*3600*24; time_t current_day = (now / (3600*24))*3600*24; if( current_hour != last_hour ) { num_to_drain += prorate(m_draining_per_poll_hour,now-current_hour,3600,m_polling_interval); } if( current_day != last_day ) { num_to_drain += prorate(m_draining_per_poll_day,now-current_day,3600*24,m_polling_interval); } MachineSet draining_machines; int num_draining = countMachines(DRAINING_CONSTRAINT,"<InternalDrainingConstraint>", &draining_machines); m_stats.MachinesDraining = num_draining; MachineSet whole_machines; int num_whole_machines = countMachines(m_whole_machine_expr.c_str(),"DEFRAG_WHOLE_MACHINE_EXPR",&whole_machines); m_stats.WholeMachines = num_whole_machines; dprintf(D_ALWAYS,"There are currently %d draining and %d whole machines.\n", num_draining,num_whole_machines); // Calculate arrival rate of fully drained machines. This is a bit tricky because we poll. // We count by finding the newly-arrived // fully drained machines, and add to that count machines which are no-longer draining. // This allows us to find machines that have fully drained, but were then claimed between // polling cycles. 
MachineSet new_machines; MachineSet no_longer_whole_machines; // Find newly-arrived machines std::set_difference(whole_machines.begin(), whole_machines.end(), m_prev_whole_machines.begin(), m_prev_whole_machines.end(), std::inserter(new_machines, new_machines.begin())); // Now, newly-departed machines std::set_difference(m_prev_draining_machines.begin(), m_prev_draining_machines.end(), draining_machines.begin(), draining_machines.end(), std::inserter(no_longer_whole_machines, no_longer_whole_machines.begin())); dprintf_set("Set of current whole machines is ", &whole_machines); dprintf_set("Set of current draining machine is ", &draining_machines); dprintf_set("Newly Arrived whole machines is ", &new_machines); dprintf_set("Newly departed draining machines is ", &no_longer_whole_machines); m_prev_draining_machines = draining_machines; m_prev_whole_machines = whole_machines; int newly_drained = new_machines.size() + no_longer_whole_machines.size(); double arrival_rate = 0.0; // If there is an arrival... if (newly_drained > 0) { time_t current = time(0); // And it isn't the first one since defrag boot... 
if (m_last_whole_machine_arrival > 0) { m_whole_machines_arrived += newly_drained; time_t arrival_time = current - m_last_whole_machine_arrival; if (arrival_time < 1) arrival_time = 1; // very unlikely, but just in case m_whole_machine_arrival_sum += newly_drained * arrival_time; arrival_rate = newly_drained / ((double)arrival_time); dprintf(D_ALWAYS, "Arrival rate is %g machines/hour\n", arrival_rate * 3600.0); } m_last_whole_machine_arrival = current; } dprintf(D_ALWAYS, "Lifetime whole machines arrived: %d\n", m_whole_machines_arrived); if (m_whole_machine_arrival_sum > 0) { double lifetime_mean = m_whole_machines_arrived / m_whole_machine_arrival_sum; dprintf(D_ALWAYS, "Lifetime mean arrival rate: %g machines / hour\n", 3600.0 * lifetime_mean); if (newly_drained > 0) { double diff = arrival_rate - lifetime_mean; m_whole_machine_arrival_mean_squared += diff * diff; } double sd = sqrt(m_whole_machine_arrival_mean_squared / m_whole_machines_arrived); dprintf(D_ALWAYS, "Lifetime mean arrival rate sd: %g\n", sd * 3600); m_stats.MeanDrainedArrival = lifetime_mean; m_stats.MeanDrainedArrivalSD = sd; m_stats.DrainedMachines = m_whole_machines_arrived; } queryDrainingCost(); // If possible, cancel some drains. 
MachineSet cancelled_machines; poll_cancel(cancelled_machines); if( num_to_drain <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because number to drain in next %ds is calculated to be 0.\n", m_polling_interval); return; } if( (int)ceil(m_draining_per_hour) <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_DRAINING_MACHINES_PER_HOUR=%f\n", m_draining_per_hour); return; } if( m_max_draining == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=0\n"); return; } if( m_max_whole_machines == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=0\n"); return; } if( m_max_draining >= 0 ) { if( num_draining >= m_max_draining ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and there are %d draining machines.\n", m_max_draining, num_draining); return; } else if( num_draining < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and the query to count draining machines failed.\n", m_max_draining); return; } } if( m_max_whole_machines >= 0 ) { if( num_whole_machines >= m_max_whole_machines ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=%d and there are %d whole machines.\n", m_max_whole_machines, num_whole_machines); return; } } // Even if m_max_whole_machines is -1 (infinite), we still need // the list of whole machines in order to filter them out in // the draining selection algorithm, so abort now if the // whole machine query failed. 
if( num_whole_machines < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because the query to find whole machines failed.\n"); return; } dprintf(D_ALWAYS,"Looking for %d machines to drain.\n",num_to_drain); ClassAdList startdAds; std::string requirements; formatstr(requirements,"(%s) && Draining =!= true",m_defrag_requirements.c_str()); if( !queryMachines(requirements.c_str(),"DEFRAG_REQUIREMENTS",startdAds) ) { dprintf(D_ALWAYS,"Doing nothing, because the query to select machines matching DEFRAG_REQUIREMENTS failed.\n"); return; } startdAds.Shuffle(); startdAds.Sort(StartdSortFunc,&m_rank_ad); startdAds.Open(); int num_drained = 0; ClassAd *startd_ad_ptr; MachineSet machines_done; while( (startd_ad_ptr=startdAds.Next()) ) { ClassAd &startd_ad = *startd_ad_ptr; std::string machine; std::string name; startd_ad.LookupString(ATTR_NAME,name); slotNameToDaemonName(name,machine); // If we have already cancelled draining on this machine, ignore it for this cycle. if( cancelled_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already cancelled draining of %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( machines_done.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already attempted to drain %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( whole_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: because it is already running as a whole machine.\n", name.c_str()); continue; } if( drain(startd_ad) ) { machines_done.insert(machine); if( ++num_drained >= num_to_drain ) { dprintf(D_ALWAYS, "Drained maximum number of machines allowed in this cycle (%d).\n", num_to_drain); break; } } } startdAds.Close(); dprintf(D_ALWAYS,"Drained %d machines (wanted to drain %d machines).\n", num_drained,num_to_drain); dprintf(D_FULLDEBUG,"Done evaluating defragmentation policy.\n"); }