/**
 * Cancel draining on machines that match the cancel requirements.
 *
 * Does nothing when m_cancel_requirements is empty. Otherwise it counts the
 * machines matching both m_cancel_requirements and DRAINING_CONSTRAINT,
 * queries the ads matching DRAINING_CONSTRAINT, and calls cancel_drain() at
 * most once per matching machine. Ads are visited after Shuffle() followed
 * by Sort(StartdSortFunc, &m_rank_ad).
 *
 * @param cancelled_machines  In/out set of machine names. Machines already in
 *        the set are skipped; every machine on which cancel_drain() is invoked
 *        here is inserted, so callers can exclude it for the rest of the cycle.
 */
void Defrag::poll_cancel(MachineSet &cancelled_machines)
{
	if (m_cancel_requirements.size() == 0) {
		// No cancel policy configured.
		return;
	}

	// Restrict the cancel requirements to machines that are draining.
	MachineSet draining_whole_machines;
	std::stringstream draining_whole_machines_ss;
	draining_whole_machines_ss << "(" << m_cancel_requirements << ") && (" << DRAINING_CONSTRAINT << ")";
	int num_draining_whole_machines = countMachines(draining_whole_machines_ss.str().c_str(),
		"<DEFRAG_CANCEL_REQUIREMENTS>", &draining_whole_machines);

	// poll() treats a negative count from countMachines() as a failed query;
	// do the same here instead of reporting "-1 machines" and carrying on.
	if (num_draining_whole_machines < 0) {
		dprintf(D_ALWAYS, "Not cancelling any draining, because the query to count draining whole machines failed.\n");
		return;
	}
	if (num_draining_whole_machines == 0) {
		// Early exit: nothing to do.
		return;
	}
	dprintf(D_ALWAYS, "Of the whole machines, %d are in the draining state.\n", num_draining_whole_machines);

	ClassAdList startdAds;
	if (!queryMachines(DRAINING_CONSTRAINT, "DRAINING_CONSTRAINT <all draining slots>", startdAds)) {
		return;
	}

	// Shuffle first, then sort against the rank ad.
	// NOTE(review): presumably the shuffle randomizes ties in the rank order -- confirm.
	startdAds.Shuffle();
	startdAds.Sort(StartdSortFunc, &m_rank_ad);

	startdAds.Open();

	unsigned int cancel_count = 0;
	ClassAd *startd_ad_ptr;
	// The loop condition guarantees a non-null ad inside the body.
	while ((startd_ad_ptr = startdAds.Next())) {
		ClassAd &startd_ad = *startd_ad_ptr;
		std::string machine;
		std::string name;
		startd_ad.LookupString(ATTR_NAME, name);
		slotNameToDaemonName(name, machine);

		// Several ads can map to the same machine name; act once per machine.
		if (!cancelled_machines.count(machine) && draining_whole_machines.count(machine)) {
			// The result of cancel_drain() is not inspected; the machine is
			// recorded as cancelled either way.
			cancel_drain(startd_ad);
			cancelled_machines.insert(machine);
			cancel_count++;
		}
	}

	startdAds.Close();

	dprintf(D_ALWAYS, "Cancelled draining of %u whole machines.\n", cancel_count);
}
void Defrag::poll() { dprintf(D_FULLDEBUG,"Evaluating defragmentation policy.\n"); // If we crash during this polling cycle, we will have saved // the time of the last poll, so the next cycle will be // scheduled on the false assumption that a cycle ran now. In // this way, we error on the side of draining too little // rather than too much. time_t now = time(NULL); time_t prev = m_last_poll; m_last_poll = now; saveState(); m_stats.Tick(); int num_to_drain = m_draining_per_poll; time_t last_hour = (prev / 3600)*3600; time_t current_hour = (now / 3600)*3600; time_t last_day = (prev / (3600*24))*3600*24; time_t current_day = (now / (3600*24))*3600*24; if( current_hour != last_hour ) { num_to_drain += prorate(m_draining_per_poll_hour,now-current_hour,3600,m_polling_interval); } if( current_day != last_day ) { num_to_drain += prorate(m_draining_per_poll_day,now-current_day,3600*24,m_polling_interval); } int num_draining = countMachines(DRAINING_CONSTRAINT,"<InternalDrainingConstraint>"); m_stats.MachinesDraining = num_draining; MachineSet whole_machines; int num_whole_machines = countMachines(m_whole_machine_expr.c_str(),"DEFRAG_WHOLE_MACHINE_EXPR",&whole_machines); m_stats.WholeMachines = num_whole_machines; dprintf(D_ALWAYS,"There are currently %d draining and %d whole machines.\n", num_draining,num_whole_machines); queryDrainingCost(); // If possible, cancel some drains. 
MachineSet cancelled_machines; poll_cancel(cancelled_machines); if( num_to_drain <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because number to drain in next %ds is calculated to be 0.\n", m_polling_interval); return; } if( (int)ceil(m_draining_per_hour) <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_DRAINING_MACHINES_PER_HOUR=%f\n", m_draining_per_hour); return; } if( m_max_draining == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=0\n"); return; } if( m_max_whole_machines == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=0\n"); return; } if( m_max_draining >= 0 ) { if( num_draining >= m_max_draining ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and there are %d draining machines.\n", m_max_draining, num_draining); return; } else if( num_draining < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and the query to count draining machines failed.\n", m_max_draining); return; } } if( m_max_whole_machines >= 0 ) { if( num_whole_machines >= m_max_whole_machines ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=%d and there are %d whole machines.\n", m_max_whole_machines, num_whole_machines); return; } } // Even if m_max_whole_machines is -1 (infinite), we still need // the list of whole machines in order to filter them out in // the draining selection algorithm, so abort now if the // whole machine query failed. 
if( num_whole_machines < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because the query to find whole machines failed.\n"); return; } dprintf(D_ALWAYS,"Looking for %d machines to drain.\n",num_to_drain); ClassAdList startdAds; std::string requirements; sprintf(requirements,"(%s) && Draining =!= true",m_defrag_requirements.c_str()); if( !queryMachines(requirements.c_str(),"DEFRAG_REQUIREMENTS",startdAds) ) { dprintf(D_ALWAYS,"Doing nothing, because the query to select machines matching DEFRAG_REQUIREMENTS failed.\n"); return; } startdAds.Shuffle(); startdAds.Sort(StartdSortFunc,&m_rank_ad); startdAds.Open(); int num_drained = 0; ClassAd *startd_ad_ptr; MachineSet machines_done; while( (startd_ad_ptr=startdAds.Next()) ) { if (!startd_ad_ptr) continue; ClassAd &startd_ad = *startd_ad_ptr; std::string machine; std::string name; startd_ad.LookupString(ATTR_NAME,name); slotNameToDaemonName(name,machine); // If we have already cancelled draining on this machine, ignore it for this cycle. if( cancelled_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already cancelled draining of %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( machines_done.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already attempted to drain %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( whole_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: because it is already running as a whole machine.\n", name.c_str()); continue; } if( drain(startd_ad) ) { machines_done.insert(machine); if( ++num_drained >= num_to_drain ) { dprintf(D_ALWAYS, "Drained maximum number of machines allowed in this cycle (%d).\n", num_to_drain); break; } } } startdAds.Close(); dprintf(D_ALWAYS,"Drained %d machines (wanted to drain %d machines).\n", num_drained,num_drained); dprintf(D_FULLDEBUG,"Done evaluating defragmentation policy.\n"); }
void Defrag::poll() { dprintf(D_FULLDEBUG,"Evaluating defragmentation policy.\n"); // If we crash during this polling cycle, we will have saved // the time of the last poll, so the next cycle will be // scheduled on the false assumption that a cycle ran now. In // this way, we error on the side of draining too little // rather than too much. time_t now = time(NULL); time_t prev = m_last_poll; m_last_poll = now; saveState(); m_stats.Tick(); int num_to_drain = m_draining_per_poll; time_t last_hour = (prev / 3600)*3600; time_t current_hour = (now / 3600)*3600; time_t last_day = (prev / (3600*24))*3600*24; time_t current_day = (now / (3600*24))*3600*24; if( current_hour != last_hour ) { num_to_drain += prorate(m_draining_per_poll_hour,now-current_hour,3600,m_polling_interval); } if( current_day != last_day ) { num_to_drain += prorate(m_draining_per_poll_day,now-current_day,3600*24,m_polling_interval); } MachineSet draining_machines; int num_draining = countMachines(DRAINING_CONSTRAINT,"<InternalDrainingConstraint>", &draining_machines); m_stats.MachinesDraining = num_draining; MachineSet whole_machines; int num_whole_machines = countMachines(m_whole_machine_expr.c_str(),"DEFRAG_WHOLE_MACHINE_EXPR",&whole_machines); m_stats.WholeMachines = num_whole_machines; dprintf(D_ALWAYS,"There are currently %d draining and %d whole machines.\n", num_draining,num_whole_machines); // Calculate arrival rate of fully drained machines. This is a bit tricky because we poll. // We count by finding the newly-arrived // fully drained machines, and add to that count machines which are no-longer draining. // This allows us to find machines that have fully drained, but were then claimed between // polling cycles. 
MachineSet new_machines; MachineSet no_longer_whole_machines; // Find newly-arrived machines std::set_difference(whole_machines.begin(), whole_machines.end(), m_prev_whole_machines.begin(), m_prev_whole_machines.end(), std::inserter(new_machines, new_machines.begin())); // Now, newly-departed machines std::set_difference(m_prev_draining_machines.begin(), m_prev_draining_machines.end(), draining_machines.begin(), draining_machines.end(), std::inserter(no_longer_whole_machines, no_longer_whole_machines.begin())); dprintf_set("Set of current whole machines is ", &whole_machines); dprintf_set("Set of current draining machine is ", &draining_machines); dprintf_set("Newly Arrived whole machines is ", &new_machines); dprintf_set("Newly departed draining machines is ", &no_longer_whole_machines); m_prev_draining_machines = draining_machines; m_prev_whole_machines = whole_machines; int newly_drained = new_machines.size() + no_longer_whole_machines.size(); double arrival_rate = 0.0; // If there is an arrival... if (newly_drained > 0) { time_t current = time(0); // And it isn't the first one since defrag boot... 
if (m_last_whole_machine_arrival > 0) { m_whole_machines_arrived += newly_drained; time_t arrival_time = current - m_last_whole_machine_arrival; if (arrival_time < 1) arrival_time = 1; // very unlikely, but just in case m_whole_machine_arrival_sum += newly_drained * arrival_time; arrival_rate = newly_drained / ((double)arrival_time); dprintf(D_ALWAYS, "Arrival rate is %g machines/hour\n", arrival_rate * 3600.0); } m_last_whole_machine_arrival = current; } dprintf(D_ALWAYS, "Lifetime whole machines arrived: %d\n", m_whole_machines_arrived); if (m_whole_machine_arrival_sum > 0) { double lifetime_mean = m_whole_machines_arrived / m_whole_machine_arrival_sum; dprintf(D_ALWAYS, "Lifetime mean arrival rate: %g machines / hour\n", 3600.0 * lifetime_mean); if (newly_drained > 0) { double diff = arrival_rate - lifetime_mean; m_whole_machine_arrival_mean_squared += diff * diff; } double sd = sqrt(m_whole_machine_arrival_mean_squared / m_whole_machines_arrived); dprintf(D_ALWAYS, "Lifetime mean arrival rate sd: %g\n", sd * 3600); m_stats.MeanDrainedArrival = lifetime_mean; m_stats.MeanDrainedArrivalSD = sd; m_stats.DrainedMachines = m_whole_machines_arrived; } queryDrainingCost(); // If possible, cancel some drains. 
MachineSet cancelled_machines; poll_cancel(cancelled_machines); if( num_to_drain <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because number to drain in next %ds is calculated to be 0.\n", m_polling_interval); return; } if( (int)ceil(m_draining_per_hour) <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_DRAINING_MACHINES_PER_HOUR=%f\n", m_draining_per_hour); return; } if( m_max_draining == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=0\n"); return; } if( m_max_whole_machines == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=0\n"); return; } if( m_max_draining >= 0 ) { if( num_draining >= m_max_draining ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and there are %d draining machines.\n", m_max_draining, num_draining); return; } else if( num_draining < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and the query to count draining machines failed.\n", m_max_draining); return; } } if( m_max_whole_machines >= 0 ) { if( num_whole_machines >= m_max_whole_machines ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=%d and there are %d whole machines.\n", m_max_whole_machines, num_whole_machines); return; } } // Even if m_max_whole_machines is -1 (infinite), we still need // the list of whole machines in order to filter them out in // the draining selection algorithm, so abort now if the // whole machine query failed. 
if( num_whole_machines < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because the query to find whole machines failed.\n"); return; } dprintf(D_ALWAYS,"Looking for %d machines to drain.\n",num_to_drain); ClassAdList startdAds; std::string requirements; formatstr(requirements,"(%s) && Draining =!= true",m_defrag_requirements.c_str()); if( !queryMachines(requirements.c_str(),"DEFRAG_REQUIREMENTS",startdAds) ) { dprintf(D_ALWAYS,"Doing nothing, because the query to select machines matching DEFRAG_REQUIREMENTS failed.\n"); return; } startdAds.Shuffle(); startdAds.Sort(StartdSortFunc,&m_rank_ad); startdAds.Open(); int num_drained = 0; ClassAd *startd_ad_ptr; MachineSet machines_done; while( (startd_ad_ptr=startdAds.Next()) ) { ClassAd &startd_ad = *startd_ad_ptr; std::string machine; std::string name; startd_ad.LookupString(ATTR_NAME,name); slotNameToDaemonName(name,machine); // If we have already cancelled draining on this machine, ignore it for this cycle. if( cancelled_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already cancelled draining of %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( machines_done.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already attempted to drain %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( whole_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: because it is already running as a whole machine.\n", name.c_str()); continue; } if( drain(startd_ad) ) { machines_done.insert(machine); if( ++num_drained >= num_to_drain ) { dprintf(D_ALWAYS, "Drained maximum number of machines allowed in this cycle (%d).\n", num_to_drain); break; } } } startdAds.Close(); dprintf(D_ALWAYS,"Drained %d machines (wanted to drain %d machines).\n", num_drained,num_to_drain); dprintf(D_FULLDEBUG,"Done evaluating defragmentation policy.\n"); }