bool ScheddNegotiate::getSatisfaction() { if( m_jobs_rejected > 0 ) { return false; } // no jobs were explicitly rejected, but did negotiation end // before we presented all of our jobs? if( m_current_job_id.cluster == -1 ) { nextJob(); } if( m_current_job_id.cluster == -1 ) { return true; // no more jobs } return false; }
// Answers the negotiator's request for a job.
//
// If there is no current job and nextJob() finds none, NO_MORE_JOBS is
// sent and m_negotiation_finished is set.  Otherwise JOB_INFO followed by
// the current job ad is sent, and the per-job resource counters are reset.
//
// Returns false if any write to the socket fails, true otherwise.
bool ScheddNegotiate::sendJobInfo(Sock *sock)
{
    sock->encode();

    // nextJob() is only consulted when no job is currently selected.
    const bool have_job = ( m_current_job_id.cluster != -1 ) || nextJob();

    if( !have_job ) {
        if( !sock->snd_int(NO_MORE_JOBS,TRUE) ) {
            dprintf( D_ALWAYS, "Can't send NO_MORE_JOBS to mgr\n" );
            return false;
        }
        m_negotiation_finished = true;
        return true;
    }

    if( !sock->put(JOB_INFO) ) {
        dprintf( D_ALWAYS, "Can't send JOB_INFO to mgr\n" );
        return false;
    }

    // Ask the negotiator to include match diagnostics in its reply.
    m_current_job_ad.Assign(ATTR_WANT_MATCH_DIAGNOSTICS, true);

    // Ship the job ad itself.
    if( !putClassAd(sock, m_current_job_ad) ) {
        dprintf( D_ALWAYS, "Can't send job ad to mgr\n" );
        // The message is still terminated on this failure path.
        sock->end_of_message();
        return false;
    }

    if( !sock->end_of_message() ) {
        dprintf( D_ALWAYS, "Can't send job eom to mgr\n" );
        return false;
    }

    // Fresh request: nothing delivered yet; the requested count defaults
    // to 1 unless the ad carries ATTR_RESOURCE_REQUEST_COUNT.
    m_current_resources_delivered = 0;
    m_current_resources_requested = 1;
    m_current_job_ad.LookupInteger(ATTR_RESOURCE_REQUEST_COUNT,
                                   m_current_resources_requested);

    dprintf( D_FULLDEBUG,
             "Sent job %d.%d (autocluster=%d resources_requested=%d) to the negotiator\n",
             m_current_job_id.cluster,
             m_current_job_id.proc,
             m_current_auto_cluster_id,
             m_current_resources_requested );

    return true;
}
// Sends up to m_num_resource_reqs_to_send resource requests to the
// negotiator, one job ad (significant attributes only) per request.
//
// Returns false as soon as sending a job ad fails, true otherwise.  On
// normal return m_num_resource_reqs_to_send is always zero.
bool ScheddNegotiate::sendResourceRequestList(Sock *sock)
{
    extern void IncrementResourceRequestsSent();

    m_jobs_can_offer = scheduler_maxJobsToOffer();

    while( m_num_resource_reqs_to_send > 0 ) {
        nextJob();

        if( !sendJobInfo(sock, true) ) {
            return false;
        }

        if( m_negotiation_finished ) {
            // sendJobInfo() found no more jobs.  If requests were already
            // sent during this round, negotiation is not treated as
            // finished, because responses (e.g. matches) are still
            // expected back from the negotiator.  Equivalent to clearing
            // the flag only when something has been sent.
            m_negotiation_finished = ( m_num_resource_reqs_sent <= 0 );
            break;
        }

        // The ad just sent carried a resource request count, so the
        // individual jobs of the current cluster must not be sent again
        // on the next pass: drop the front cluster to skip ahead.
        if( !m_jobs->empty() ) {
            ResourceRequestCluster *finished_cluster = m_jobs->front();
            m_jobs->pop_front();
            delete finished_cluster;
        }

        ++m_num_resource_reqs_sent;
        --m_num_resource_reqs_to_send;
        IncrementResourceRequestsSent();
    }

    // No further requests are being sent now; the original notes that
    // this counter is inspected in nextJob(), so it is zeroed explicitly.
    m_num_resource_reqs_to_send = 0;
    return true;
}
void AvatarJobManager::runJob() { QMutexLocker(&mutex()); if (IsJobRunning) return; if (!hasJob()) return; IsJobRunning = true; Contact contact = nextJob(); AvatarJobRunner *runner = new AvatarJobRunner(contact, this); connect(runner, SIGNAL(jobFinished(bool)), this, SLOT(jobFinished())); runner->runJob(); }
// Invoked after readMsg() has returned true: carries out the operation
// the negotiator just sent (stored in m_operation).
//
// Returns MESSAGE_FINISHED only when replying with job info failed, which
// tells the messenger to close the socket.  In every other case returns
// MESSAGE_CONTINUING: the socket has either been handed over to
// scheduler_handleNegotiationFinished() or re-armed to wait for the
// negotiator's next message.
DCMsg::MessageClosureEnum
ScheddNegotiate::messageReceived( DCMessenger *messenger, Sock *sock )
{
    switch( m_operation ) {

    case REJECTED:
    case REJECTED_WITH_REASON:
        // A plain REJECTED carries no reason text, so supply a placeholder.
        if( m_operation == REJECTED ) {
            m_reject_reason = "Unknown reason";
        }
        scheduler_handleJobRejected( m_current_job_id, m_reject_reason.c_str() );
        m_jobs_rejected++;
        setAutoClusterRejected( m_current_auto_cluster_id );
        nextJob();
        break;

    case SEND_JOB_INFO:
        if( !sendJobInfo(sock) ) {
            // Could not talk to the negotiator; let the socket be closed.
            return MESSAGE_FINISHED;
        }
        break;

    case PERMISSION_AND_AD: {
        // If the matched slot is partitionable, the match ad is edited to
        // look like the resulting dynamic slot.  (The original comment
        // notes this may be redundant with the fixup done at claim time
        // in contactStartd().)  Every path through this case ends by
        // advancing to the next job.
        if( fixupPartitionableSlot(&m_current_job_ad,&m_match_ad) ) {
            std::string matched_slot;
            m_match_ad.LookupString(ATTR_NAME,matched_slot);

            int is_offline = false;
            m_match_ad.EvalBool(ATTR_OFFLINE,NULL,is_offline);

            if( is_offline ) {
                dprintf(D_ALWAYS,"Job %d.%d matched to offline machine %s.\n",
                        m_current_job_id.cluster,m_current_job_id.proc,
                        matched_slot.c_str());
            }
            else if( scheduler_handleMatch(m_current_job_id,m_claim_id.c_str(),
                                           m_match_ad,matched_slot.c_str()) )
            {
                m_jobs_matched++;
            }
        }
        nextJob();
        break;
    }

    case END_NEGOTIATE:
        dprintf( D_ALWAYS, "Lost priority - %d jobs matched\n", m_jobs_matched );
        m_negotiation_finished = true;
        break;

    default:
        EXCEPT("should never get here (negotiation op %d)",m_operation);
    }

    if( !m_negotiation_finished ) {
        // Still mid-round: wait for the negotiator to write its next message.
        messenger->startReceiveMsg( this, sock );
    }
    else {
        // The callee takes ownership of sock (e.g. by registering it to
        // wait for the next NEGOTIATE command).
        scheduler_handleNegotiationFinished( sock );
        sock = NULL;
    }

    // MESSAGE_CONTINUING tells the messenger not to close the socket.
    return MESSAGE_CONTINUING;
}
// Invoked after readMsg() has returned true: carries out the operation
// the negotiator just sent (stored in m_operation).  This variant also
// handles resource request lists.
//
// Returns MESSAGE_FINISHED only when sending job info or the resource
// request list failed, which tells the messenger to close the socket.
// In every other case returns MESSAGE_CONTINUING: the socket has either
// been handed over to scheduler_handleNegotiationFinished() or re-armed
// to wait for the negotiator's next message.
DCMsg::MessageClosureEnum
ScheddNegotiate::messageReceived( DCMessenger *messenger, Sock *sock )
{
    switch( m_operation ) {

    case REJECTED:
    case REJECTED_WITH_REASON: {
        // A plain REJECTED carries no reason text, so supply a placeholder.
        if( m_operation == REJECTED ) {
            m_reject_reason = "Unknown reason";
        }

        // With resource request lists the reason may end with
        // "...|autocluster|cluster.proc|".  When it does, adopt that
        // autocluster and job id as the current ones and cut the suffix
        // off the reason text.
        const int bar_pos = m_reject_reason.FindChar('|');
        if( bar_pos >= 0 ) {
            m_reject_reason.Tokenize();
            // The first token is the reason text itself; it is skipped.
            m_reject_reason.GetNextToken("|",false);
            const char *ac_token = m_reject_reason.GetNextToken("|",false);
            const char *jobid_token = m_reject_reason.GetNextToken("|",false);

            if( ac_token && jobid_token ) {
                int rejected_cluster;
                int rejected_proc;
                m_current_auto_cluster_id = atoi(ac_token);
                StrToProcId(jobid_token,rejected_cluster,rejected_proc);

                // Switching to a different job restarts the delivered count.
                const bool same_job =
                    ( rejected_cluster == m_current_job_id.cluster &&
                      rejected_proc == m_current_job_id.proc );
                if( !same_job ) {
                    m_current_resources_delivered = 0;
                }
                m_current_job_id.cluster = rejected_cluster;
                m_current_job_id.proc = rejected_proc;
            }

            // Truncates the string at the first '|'.
            m_reject_reason.setChar(bar_pos,'\0');
        }

        scheduler_handleJobRejected( m_current_job_id, m_reject_reason.c_str() );
        m_jobs_rejected++;
        setAutoClusterRejected( m_current_auto_cluster_id );
        nextJob();
        break;
    }

    case SEND_JOB_INFO:
        // Clear the counter of requests sent this round.
        m_num_resource_reqs_sent = 0;
        if( !sendJobInfo(sock) ) {
            // Could not talk to the negotiator; let the socket be closed.
            return MESSAGE_FINISHED;
        }
        break;

    case SEND_RESOURCE_REQUEST_LIST:
        // Clear the counter of requests sent this round.
        m_num_resource_reqs_sent = 0;
        if( !sendResourceRequestList(sock) ) {
            // Could not talk to the negotiator; let the socket be closed.
            return MESSAGE_FINISHED;
        }
        break;

    case PERMISSION_AND_AD: {
        // After a whole list of requests has been sent, a single
        // m_current_job_id says little.  If the offer carries the job id
        // it answers (the original comment attributes this to 8.3.0+
        // negotiators), adopt that id as the current job.
        int offered_cluster = -1;
        int offered_proc = -1;
        m_match_ad.LookupInteger(ATTR_RESOURCE_REQUEST_CLUSTER, offered_cluster);
        m_match_ad.LookupInteger(ATTR_RESOURCE_REQUEST_PROC, offered_proc);

        if( offered_cluster != -1 && offered_proc != -1 ) {
            const bool same_job =
                ( offered_cluster == m_current_job_id.cluster &&
                  offered_proc == m_current_job_id.proc );
            if( !same_job ) {
                m_current_resources_delivered = 0;
            }
            m_current_job_id.cluster = offered_cluster;
            m_current_job_id.proc = offered_proc;
        }
        m_current_resources_delivered++;

        std::string matched_slot;
        m_match_ad.LookupString(ATTR_NAME,matched_slot);

        int is_offline = false;
        m_match_ad.EvalBool(ATTR_OFFLINE,NULL,is_offline);

        if( is_offline ) {
            dprintf(D_ALWAYS,"Job %d.%d (delivered=%d) matched to offline machine %s.\n",
                    m_current_job_id.cluster,m_current_job_id.proc,
                    m_current_resources_delivered,matched_slot.c_str());
        }
        else if( scheduler_handleMatch(m_current_job_id,m_claim_id.c_str(),
                                       m_extra_claims.c_str(),
                                       m_match_ad,matched_slot.c_str()) )
        {
            m_jobs_matched++;
        }

        // Offline or not, matched or not, move on to the next job.
        nextJob();
        break;
    }

    case END_NEGOTIATE:
        dprintf( D_ALWAYS, "Lost priority - %d jobs matched\n", m_jobs_matched );
        m_negotiation_finished = true;
        break;

    default:
        EXCEPT("should never get here (negotiation op %d)",m_operation);
    } // end of switch on m_operation

    if( !m_negotiation_finished ) {
        // Still mid-round: wait for the negotiator to write its next message.
        messenger->startReceiveMsg( this, sock );
    }
    else {
        // The callee takes ownership of sock (e.g. by registering it to
        // wait for the next NEGOTIATE command).
        scheduler_handleNegotiationFinished( sock );
        sock = NULL;
    }

    // MESSAGE_CONTINUING tells the messenger not to close the socket.
    return MESSAGE_CONTINUING;
}
bool ScheddNegotiate::sendJobInfo(Sock *sock, bool just_sig_attrs) { // The Negotiator wants us to send it a job. sock->encode(); if( m_current_job_id.cluster == -1 && !nextJob() ) { if( !sock->snd_int(NO_MORE_JOBS,TRUE) ) { dprintf( D_ALWAYS, "Can't send NO_MORE_JOBS to mgr\n" ); return false; } m_negotiation_finished = true; return true; } if( !sock->put(JOB_INFO) ) { dprintf( D_ALWAYS, "Can't send JOB_INFO to mgr\n" ); return false; } // If schedd wants pslot preemption, advertise here m_current_job_ad.Assign(ATTR_WANT_PSLOT_PREEMPTION, param_boolean("ALLOW_PSLOT_PREEMPTION", false)); // request match diagnostics // 0 = no match diagnostics // 1 = match diagnostics string // 2 = match diagnostics string decorated w/ autocluster + jobid m_current_job_ad.Assign(ATTR_WANT_MATCH_DIAGNOSTICS, (int) 2); m_current_job_ad.Assign(ATTR_WANT_PSLOT_PREEMPTION, param_boolean("ALLOW_PSLOT_PREEMPTION", false)); // Send the ad to the negotiator int putad_result = 0; std::string auto_cluster_attrs; if ( just_sig_attrs && m_current_job_ad.LookupString(ATTR_AUTO_CLUSTER_ATTRS, auto_cluster_attrs) ) { // don't send the entire job ad; just send significant attrs classad::References sig_attrs; StringTokenIterator list(auto_cluster_attrs); const std::string *attr; while ((attr = list.next_string())) { sig_attrs.insert(*attr); } // besides significant attrs, we also always want to send these attrs cuz // the matchmaker explicitly looks for them (for dprintfs or whatever). sig_attrs.insert(ATTR_OWNER); sig_attrs.insert(ATTR_CLUSTER_ID); sig_attrs.insert(ATTR_PROC_ID); sig_attrs.insert(ATTR_RESOURCE_REQUEST_COUNT); sig_attrs.insert(ATTR_GLOBAL_JOB_ID); sig_attrs.insert(ATTR_AUTO_CLUSTER_ID); sig_attrs.insert(ATTR_WANT_MATCH_DIAGNOSTICS); sig_attrs.insert(ATTR_WANT_PSLOT_PREEMPTION); sig_attrs.insert(ATTR_WANT_CLAIMING); // used for Condor-G matchmaking // ship it! putad_result = putClassAd(sock, m_current_job_ad, 0, &sig_attrs); } else { // send the entire classad. 
perhaps we are doing this because the // ad does not have ATTR_AUTO_CLUSTER_ATTRS defined for some reason, // or perhaps we are doing this because we were explicitly told to do so. putad_result = putClassAd(sock, m_current_job_ad); } if( !putad_result ) { dprintf( D_ALWAYS, "Can't send job ad to mgr\n" ); sock->end_of_message(); return false; } if( !sock->end_of_message() ) { dprintf( D_ALWAYS, "Can't send job eom to mgr\n" ); return false; } m_current_resources_delivered = 0; m_current_resources_requested = 1; m_current_job_ad.LookupInteger(ATTR_RESOURCE_REQUEST_COUNT,m_current_resources_requested); dprintf( D_FULLDEBUG, "Sent job %d.%d (autocluster=%d resources_requested=%d) to the negotiator\n", m_current_job_id.cluster, m_current_job_id.proc, m_current_auto_cluster_id, m_current_resources_requested ); return true; }
// Boilerplate program options code from http://www.radmangames.com/programming/how-to-use-boost-program_options int main(int argc, char** argv) { bool enable_sorting = true; double seed = 0; double power_mean = 300.0; // Watts double power_stdev = power_mean/10.0; double power_estimate_error_stdev = power_stdev/10.0; double arrival_rate = 1000.0; // Jobs per second double completion_time_mean = 0.9*NUM_SERVERS/arrival_rate; // Seconds double completion_time_stdev = completion_time_mean/10.0; double sorting_time_min = completion_time_mean/1000.0; double sorting_time_max = sorting_time_min*2.0; double routing_time_min = sorting_time_min; double routing_time_max = sorting_time_max; try { /** Define and parse the program options */ namespace po = boost::program_options; po::options_description desc("Options"); desc.add_options() ("help", "Print help messages") ("seed", po::value<double>(&seed), "Seed for random number generator") ("power_estimate_error_stdev", po::value<double>(&power_estimate_error_stdev), "Power estimate standard deviation") ("completion_time_stdev", po::value<double>(&completion_time_stdev), "Completion time standard deviation") ("enable_sorting", po::value<bool>(&enable_sorting), "Enable sorting") ; po::variables_map vm; try { po::store(po::parse_command_line(argc, argv, desc), vm); // can throw /** --help option */ if ( vm.count("help") ) { std::cout << "Basic Command Line Parameter App" << std::endl << desc << std::endl; return SUCCESS; } po::notify(vm); // throws on error, so do after help in case // there are any problems } catch(po::error& e) { std::cerr << "ERROR: " << e.what() << std::endl << std::endl; std::cerr << desc << std::endl; return ERROR_IN_COMMAND_LINE; } // BEGIN APPLICATION CODE // DataCenterRandomPtr rand(new DataCenterRandom( seed, power_mean, power_stdev, arrival_rate, completion_time_mean, completion_time_stdev, sorting_time_min, sorting_time_max, routing_time_min, routing_time_max, power_estimate_error_stdev)); 
PriorityTypePtr sortOrder(new JobEvent::PriorityType(JobEvent::TIME)); PriorityQueueEventListPtr eventList(new PriorityQueueEventList( EVENT_LIST_LEN, sortOrder)); PriorityQueueWorkingServers::SortingDomain sortingDomain; if(enable_sorting){ sortingDomain = PriorityQueueWorkingServers::POWER_AWARE; } else{ sortingDomain = PriorityQueueWorkingServers::RANDOM; } std::ostringstream s; s << seed; AccumulatorStatistics statistics(s.str()); PriorityQueueWorkingServersPtr workingServersQueue(new PriorityQueueWorkingServers( NUM_SERVERS, rand, eventList, statistics.getAccumulator(AccumulatorStatistics::LATENCY), statistics.getAccumulator(AccumulatorStatistics::TOTAL_ENERGY), sortOrder, s.str() + "server_currents.csv", sortingDomain)); PriorityQueueJobSorterPtr sortedJobQueue(new PriorityQueueJobSorter( SORTED_JOBS_LIST_LEN, rand, workingServersQueue, eventList, sortOrder, enable_sorting)); QueueJobBufferPtr unsortedJobQueue(new QueueJobBuffer( UNSORTED_JOBS_LIST_LEN, statistics.getAccumulator(AccumulatorStatistics::TIME_BETWEEN_REJECTED_JOBS), sortedJobQueue)); #ifndef UNITTEST double time = 0; JobEventPtr arrival(new JobEvent(0, Event::JOB_ARRIVAL, sortOrder)); _NOTEL(0,"Welcome to the data center stacked server simulator."); _LOGL(1,"Initialization parameters: "); _LOGL(1,"Sorting enabled: " << enable_sorting); _LOGL(1,"Simulation time: " << MAX_TIME); _LOGL(1,"Event list length: " << EVENT_LIST_LEN); _LOGL(1,"Unsorted jobs list length: " << UNSORTED_JOBS_LIST_LEN); _LOGL(1,"Sorted jobs list length: " << SORTED_JOBS_LIST_LEN); _LOGL(1,"Number of servers: " << NUM_SERVERS); _LOGL(1, *rand); // Queue up initial arrival. 
_NOTEL(2,"Creating initial arrival event."); eventList->enqueue(arrival); while(time < MAX_TIME){ EventPtr e = eventList->dequeue(); time = e->time; _NOTEL(2, time); if(e->type == Event::JOB_ARRIVAL || e->type == Event::JOB_FINISHED){ JobEventPtr job = boost::static_pointer_cast<JobEvent>(e); if(job->type == Event::JOB_ARRIVAL){ double t = time + rand->sample_arrivalTime(); _NOTEL(2,"Processing job arrival event. Scheduling next job arrival for time " << t); JobEventPtr nextJob(new JobEvent(t, Event::JOB_ARRIVAL, sortOrder)); eventList->enqueue(nextJob); if(!unsortedJobQueue->enqueue(job)){ if(!sortedJobQueue->enqueue(job,time)){ workingServersQueue->enqueue(job,time); } } } else{ // if(job->type == Event::JOB_FINISHED){ _NOTEL(2,"Processing job removal event."); workingServersQueue->remove(job,time); if(enable_sorting){ if(!sortedJobQueue->is_busy() && !sortedJobQueue->is_empty()){ JobEventPtr job = sortedJobQueue->dequeueJob(); workingServersQueue->enqueue(job,time); } } else{ if(!unsortedJobQueue->is_empty()){ JobEventPtr job = unsortedJobQueue->dequeue(); workingServersQueue->enqueue(job,time); } } } } else if(e->type == Event::SORTED_QUEUE_READY){ if(enable_sorting){ _NOTEL(2,"Processing sorted queue ready event."); sortedJobQueue->reset_busy(); if(!sortedJobQueue->is_empty() && !workingServersQueue->is_busy() && !workingServersQueue->is_full()){ JobEventPtr job = sortedJobQueue->dequeueJob(); workingServersQueue->enqueue(job,time); } if(!unsortedJobQueue->is_empty()){ JobEventPtr job = unsortedJobQueue->dequeue(); if(!sortedJobQueue->enqueue(job,time)){ workingServersQueue->enqueue(job,time); } } else{ _NOTEL(2,"Nothing for sorted queue to do."); } } else{ _NOTEL(0,"PROBLEM!!!"); } } else{ // if(e->type == Event::WORKING_SERVERS_QUEUE_READY){ _NOTEL(2,"Processing working servers queue ready event."); workingServersQueue->reset_busy(); if(enable_sorting){ if(!sortedJobQueue->is_busy() && !sortedJobQueue->is_empty()){ JobEventPtr job = 
sortedJobQueue->dequeueJob(); workingServersQueue->enqueue(job,time); } } else{ if(!unsortedJobQueue->is_empty()){ JobEventPtr job = unsortedJobQueue->dequeue(); workingServersQueue->enqueue(job,time); } } } //_NOTE(3, (*eventList) << std::endl); } _NOTEL(1,"Finished simulation of " << time << " virtual seconds."); _LOGL(0,"Simulation results: " << std::endl << statistics); AccumulatorPtr latency = statistics.getAccumulator(AccumulatorStatistics::LATENCY); AccumulatorPtr total_energy = statistics.getAccumulator(AccumulatorStatistics::TOTAL_ENERGY); _LOGL(0, latency->getMean() << "$\\pm$" << latency->getCI(0.95) << " & " << total_energy->getMean() << "$\\pm$" << total_energy->getCI(0.95)); return 0; // Run the Unit tests #else _NOTEL(0,"Welcome to the unit tests."); test_accumulator(rand,statistics); test_working_servers(workingServersQueue,sortOrder,statistics); return 0; #endif // END APPLICATION CODE // } catch(std::exception& e) { std::cerr << "Unhandled Exception reached the top of main: " << e.what() << ", application will now exit" << std::endl; return ERROR_UNHANDLED_EXCEPTION; } return SUCCESS; } // main