int CondorQ::getAndFilterAds (const char *constraint, StringList &attrs, ClassAdList &list, bool useAllJobs) { if (useAllJobs) { char *attrs_str = attrs.print_to_delimed_string(); GetAllJobsByConstraint(constraint, attrs_str, list); free(attrs_str); } else { ClassAd *ad; if ((ad = GetNextJobByConstraint(constraint, 1)) != NULL) { list.Insert(ad); while((ad = GetNextJobByConstraint(constraint, 0)) != NULL) { list.Insert(ad); } } } // here GetNextJobByConstraint returned NULL. check if it was // because of the network or not. if qmgmt had a problem with // the net, then errno is set to ETIMEDOUT, and we should fail. if ( errno == ETIMEDOUT ) { return Q_SCHEDD_COMMUNICATION_ERROR; } return Q_OK; }
int Defrag::countMachines(char const *constraint,char const *constraint_source, MachineSet *machines) { ClassAdList startdAds; int count = 0; if( !queryMachines(constraint,constraint_source,startdAds) ) { return -1; } MachineSet my_machines; if( !machines ) { machines = &my_machines; } startdAds.Open(); ClassAd *startd_ad; while( (startd_ad=startdAds.Next()) ) { std::string machine; std::string name; startd_ad->LookupString(ATTR_NAME,name); slotNameToDaemonName(name,machine); if( machines->count(machine) ) { continue; } machines->insert(machine); count++; } startdAds.Close(); dprintf(D_FULLDEBUG,"Counted %d machines matching %s=%s\n", count,constraint_source,constraint); return count; }
// Read the history from the specified history file, or from all the history files. // There are multiple history files because we do rotation. static void readHistoryFromFiles(bool fileisuserlog, const char *JobHistoryFileName, const char* constraint, ExprTree *constraintExpr) { printHeader(); if (JobHistoryFileName) { if (fileisuserlog) { ClassAdList jobs; if ( ! userlog_to_classads(JobHistoryFileName, jobs, NULL, 0, constraint)) { fprintf(stderr, "Error: Can't open userlog %s\n", JobHistoryFileName); exit(1); } printJobAds(jobs); jobs.Clear(); } else { // If the user specified the name of the file to read, we read that file only. readHistoryFromFileEx(JobHistoryFileName, constraint, constraintExpr, backwards); } } else { // The user didn't specify the name of the file to read, so we read // the history file, and any backups (rotated versions). int numHistoryFiles; const char **historyFiles; historyFiles = findHistoryFiles("HISTORY", &numHistoryFiles); if (!historyFiles) { fprintf( stderr, "Error: No history file is defined\n"); fprintf(stderr, "\n"); print_wrapped_text("Extra Info: " "The variable HISTORY is not defined in " "your config file. If you want Condor to " "keep a history of past jobs, you must " "define HISTORY in your config file", stderr ); exit(1); } if (historyFiles && numHistoryFiles > 0) { int fileIndex; if (backwards) { // Reverse reading of history files array for(fileIndex = numHistoryFiles - 1; fileIndex >= 0; fileIndex--) { readHistoryFromFileEx(historyFiles[fileIndex], constraint, constraintExpr, backwards); } } else { for (fileIndex = 0; fileIndex < numHistoryFiles; fileIndex++) { readHistoryFromFileEx(historyFiles[fileIndex], constraint, constraintExpr, backwards); } } freeHistoryFilesList(historyFiles); } } return; }
// Rebuild m_daemon_ips and m_default_aggregate_host.
//
// For every ad in `daemon_ads` whose ATTR_MY_ADDRESS yields a host, the
// ad's ATTR_MACHINE and ATTR_NAME values (when non-empty) are each mapped
// to that host.  Every collector with both a full hostname and an address
// is mapped the same way, and the first collector hostname seen becomes
// m_default_aggregate_host.  Existing map entries are not overwritten
// (std::map::insert keeps the first value for a key).
void
StatsD::mapDaemonIPs(ClassAdList &daemon_ads,CollectorList &collectors) {
    // The map of machines to IPs is used when directing ganglia to
    // associate specific metrics with specific hosts (host spoofing)
    m_daemon_ips.clear();
    daemon_ads.Open();
    ClassAd *daemon;
    while( (daemon=daemon_ads.Next()) ) {
        std::string machine,name,my_address;
        daemon->EvaluateAttrString(ATTR_MACHINE,machine);
        // Look up the daemon's name, not the machine a second time;
        // otherwise the name-keyed entry below merely repeats the
        // machine-keyed one.
        daemon->EvaluateAttrString(ATTR_NAME,name);
        daemon->EvaluateAttrString(ATTR_MY_ADDRESS,my_address);
        Sinful s(my_address.c_str());
        if( !s.getHost() ) {
            continue;
        }
        std::string ip = s.getHost();
        if( !machine.empty() ) {
            m_daemon_ips.insert( std::map< std::string,std::string >::value_type(machine,ip) );
        }
        if( !name.empty() ) {
            m_daemon_ips.insert( std::map< std::string,std::string >::value_type(name,ip) );
        }
    }
    daemon_ads.Close();

    // Also add a mapping of collector hosts to IPs, and determine the
    // collector host to use as the default machine name for aggregate
    // metrics.
    m_default_aggregate_host = "";
    DCCollector *collector=NULL;
    collectors.rewind();
    while( (collectors.next(collector)) ) {
        char const *collector_host = collector->fullHostname();
        char const *collector_addr = collector->addr();
        if( collector_host && m_default_aggregate_host.empty() ) {
            m_default_aggregate_host = collector_host;
        }
        if( collector_host && collector_addr ) {
            Sinful s(collector_addr);
            if( s.getHost() ) {
                char const *ip = s.getHost();
                m_daemon_ips.insert( std::map< std::string,std::string >::value_type(collector_host,ip) );
            }
        }
    }
}
bool Defrag::queryMachines(char const *constraint,char const *constraint_source,ClassAdList &startdAds) { CondorQuery startdQuery(STARTD_AD); validateExpr(constraint,constraint_source); startdQuery.addANDConstraint(constraint); CollectorList* collects = daemonCore->getCollectorList(); ASSERT( collects ); QueryResult result; result = collects->query(startdQuery,startdAds); if( result != Q_OK ) { dprintf(D_ALWAYS, "Couldn't fetch startd ads using constraint " "%s=%s: %s\n", constraint_source,constraint, getStrQueryResult(result)); return false; } dprintf(D_FULLDEBUG,"Got %d startd ads matching %s=%s\n", startdAds.MyLength(), constraint_source, constraint); return true; }
int CondorQ::fetchQueueFromDB (ClassAdList &list, char *&lastUpdate, const char *dbconn, CondorError* /*errstack*/) { #ifndef HAVE_EXT_POSTGRESQL (void) list; (void) lastUpdate; (void) dbconn; #else int result; JobQueueSnapshot *jqSnapshot; const char *constraint; ClassAd *ad; QuillErrCode rv; ExprTree *tree; jqSnapshot = new JobQueueSnapshot(dbconn); rv = jqSnapshot->startIterateAllClassAds(clusterarray, numclusters, procarray, numprocs, schedd, FALSE, scheddBirthdate, lastUpdate); if (rv == QUILL_FAILURE) { delete jqSnapshot; return Q_COMMUNICATION_ERROR; } else if (rv == JOB_QUEUE_EMPTY) { delete jqSnapshot; return Q_OK; } // make the query ad if ((result = query.makeQuery (tree)) != Q_OK) { delete jqSnapshot; return result; } constraint = ExprTreeToString(tree); delete tree; ad = getDBNextJobByConstraint(constraint, jqSnapshot); while (ad != (ClassAd *) 0) { ad->ChainCollapse(); list.Insert(ad); ad = getDBNextJobByConstraint(constraint, jqSnapshot); } delete jqSnapshot; #endif /* HAVE_EXT_POSTGRESQL */ return Q_OK; }
// Insert into `out` every ad from `in` that half-matches this query's ad.
//
// The same ClassAd pointers are inserted into `out`; nothing is copied or
// removed from `in`.  Returns the getQueryAd() status if the query ad
// cannot be built, and Q_OK otherwise.
QueryResult CondorQuery::
filterAds (ClassAdList &in, ClassAdList &out)
{
    // make the query ad
    ClassAd queryAd;
    QueryResult status = getQueryAd (queryAd);
    if (status != Q_OK) {
        return status;
    }

    in.Open();
    for (ClassAd *candidate = (ClassAd *) in.Next();
         candidate;
         candidate = (ClassAd *) in.Next()) {
        // keep the candidate only when a match occurs
        if (IsAHalfMatch(&queryAd, candidate)) {
            out.Insert (candidate);
        }
    }
    in.Close ();

    return Q_OK;
}
// Print every ad in `jobs` with printJob().  When both the `longformat`
// and `use_xml` flags are set, the output is wrapped in the ClassAd XML
// file header and footer.
static void printJobAds(ClassAdList & jobs)
{
    if (longformat && use_xml) {
        std::string header;
        AddClassAdXMLFileHeader(header);
        printf("%s\n", header.c_str());
    }

    jobs.Open();
    for (ClassAd *job = jobs.Next(); job; job = jobs.Next()) {
        printJob(*job);
    }
    jobs.Close();

    if (longformat && use_xml) {
        std::string footer;
        AddClassAdXMLFileFooter(footer);
        printf("%s\n", footer.c_str());
    }
}
// Recompute m_execute_only_nodes from `daemon_ads`.
//
// Each ad's ATTR_MACHINE is classified by its ATTR_MY_TYPE (compared
// case-insensitively): "machine" marks an execute node, "scheduler" a
// submit node, and "negotiator" or "collector" a central-manager node.  A
// machine ends up in m_execute_only_nodes when it is an execute node and
// neither a submit nor a central-manager node.
void
StatsD::determineExecuteNodes(ClassAdList &daemon_ads) {
    std::set< std::string > schedd_hosts;
    std::set< std::string > startd_hosts;
    std::set< std::string > central_manager_hosts;

    daemon_ads.Open();
    for( ClassAd *ad = daemon_ads.Next(); ad; ad = daemon_ads.Next() ) {
        std::string host;
        std::string ad_type;
        ad->EvaluateAttrString(ATTR_MACHINE,host);
        ad->EvaluateAttrString(ATTR_MY_TYPE,ad_type);
        char const *type_name = ad_type.c_str();

        if( strcasecmp(type_name,"machine")==0 ) {
            startd_hosts.insert(host);
        }
        else if( strcasecmp(type_name,"scheduler")==0 ) {
            schedd_hosts.insert(host);
        }
        else if( strcasecmp(type_name,"negotiator")==0 ||
                 strcasecmp(type_name,"collector")==0 ) {
            central_manager_hosts.insert(host);
        }
    }
    daemon_ads.Close();

    m_execute_only_nodes.clear();
    std::set< std::string >::iterator host_itr;
    for( host_itr = startd_hosts.begin(); host_itr != startd_hosts.end(); ++host_itr ) {
        if( schedd_hosts.count(*host_itr) || central_manager_hosts.count(*host_itr) ) {
            // This host also plays another role, so it is not execute-only.
            continue;
        }
        m_execute_only_nodes.insert(*host_itr);
    }

    if( !m_per_execute_node_metrics && m_execute_only_nodes.size()>0 ) {
        dprintf(D_FULLDEBUG,"Filtering out metrics for %d execute nodes because PER_EXECUTE_NODE_METRICS=False.\n",
                (int)m_execute_only_nodes.size());
    }
}
// Cancel draining on machines that satisfy m_cancel_requirements.
//
// Does nothing when m_cancel_requirements is empty.  Otherwise the set of
// machines matching "(m_cancel_requirements) && (DRAINING_CONSTRAINT)" is
// collected; then all draining slots are queried, shuffled, sorted with
// StartdSortFunc against m_rank_ad, and cancel_drain() is called on the
// first ad seen for each machine in that set that is not already in
// `cancelled_machines`.  Each such machine is added to
// `cancelled_machines`.
void Defrag::poll_cancel(MachineSet &cancelled_machines)
{
    if (!m_cancel_requirements.size())
    {
        return;
    }

    MachineSet draining_whole_machines;
    std::stringstream draining_whole_machines_ss;
    draining_whole_machines_ss << "(" << m_cancel_requirements << ") && (" << DRAINING_CONSTRAINT << ")";
    int num_draining_whole_machines = countMachines(draining_whole_machines_ss.str().c_str(),
        "<DEFRAG_CANCEL_REQUIREMENTS>", &draining_whole_machines);

    // countMachines() returns -1 when its query fails.  Treat that like
    // "none found" instead of reporting "-1 are in the draining state" and
    // carrying on with an empty machine set.
    if (num_draining_whole_machines <= 0)
    {
        // Early exit: nothing to do.
        return;
    }
    dprintf(D_ALWAYS, "Of the whole machines, %d are in the draining state.\n", num_draining_whole_machines);

    ClassAdList startdAds;
    if (!queryMachines(DRAINING_CONSTRAINT, "DRAINING_CONSTRAINT <all draining slots>",startdAds))
    {
        return;
    }

    startdAds.Shuffle();
    startdAds.Sort(StartdSortFunc,&m_rank_ad);

    startdAds.Open();

    unsigned int cancel_count = 0;
    ClassAd *startd_ad_ptr;
    while ( (startd_ad_ptr=startdAds.Next()) )
    {
        // The loop condition already guarantees a non-null ad here.
        ClassAd &startd_ad = *startd_ad_ptr;
        std::string machine;
        std::string name;
        startd_ad.LookupString(ATTR_NAME,name);
        slotNameToDaemonName(name,machine);

        // Cancel at most once per machine, and only for machines that
        // matched the cancel requirements above.
        if( !cancelled_machines.count(machine) && draining_whole_machines.count(machine) ) {
            cancel_drain(startd_ad);
            cancelled_machines.insert(machine);
            cancel_count ++;
        }
    }

    startdAds.Close();

    dprintf(D_ALWAYS, "Cancelled draining of %u whole machines.\n", cancel_count);
}
static bool read_classad_file(const char *filename, ClassAdList &classads, const char * constr) { bool success = false; FILE* file = safe_fopen_wrapper_follow(filename, "r"); if (file == NULL) { fprintf(stderr, "Can't open file of job ads: %s\n", filename); return false; } else { CondorClassAdFileParseHelper parse_helper("\n"); for (;;) { ClassAd* classad = new ClassAd(); int error; bool is_eof; int cAttrs = classad->InsertFromFile(file, is_eof, error, &parse_helper); bool include_classad = cAttrs > 0 && error >= 0; if (include_classad && constr) { classad::Value val; if (classad->EvaluateExpr(constr,val)) { if ( ! val.IsBooleanValueEquiv(include_classad)) { include_classad = false; } } } if (include_classad) { classads.Insert(classad); } else { delete classad; } if (is_eof) { success = true; break; } if (error < 0) { success = false; break; } } fclose(file); } return success; }
int LeaseManager::timerHandler_GetAds ( void ) { CondorQuery query( m_queryAdtypeNum ); if ( m_queryConstraints.length() ) { query.addANDConstraint( m_queryConstraints.c_str() ); } if ( m_enable_ad_debug ) { ClassAd qad; query.getQueryAd( qad ); dprintf( D_FULLDEBUG, "Query Ad:\n" ); dPrintAd( D_FULLDEBUG, qad ); } QueryResult result; ClassAdList ads; dprintf(D_ALWAYS, " Getting all resource ads ...\n" ); result = m_collectorList->query( query, ads ); if( result != Q_OK ) { dprintf(D_ALWAYS, "Couldn't fetch ads: %s\n", getStrQueryResult(result)); return false; } m_resources.StartExpire( ); dprintf(D_ALWAYS, " Processing %d ads ...\n", ads.MyLength() ); DebugTimerDprintf timer; int list_length = ads.MyLength(); ads.Open( ); ClassAd *ad; while( ( ad = ads.Next()) ) { // Give the ad to the collection ads.Remove( ad ); m_resources.AddResource( ad ); } ads.Close( ); timer.Log( "ProcessAds", list_length ); dprintf( D_ALWAYS, " Done processing %d ads; pruning\n", list_length); timer.Start( ); m_resources.PruneExpired( ); timer.Log( "PruneExpired" ); dprintf( D_ALWAYS, " Done pruning ads\n" ); return 0; }
void Rooster::poll() { dprintf(D_FULLDEBUG,"C**k-a-doodle-doo! (Time to look for machines to wake up.)\n"); ClassAdList startdAds; CondorQuery unhibernateQuery(STARTD_AD); ExprTree *requirements = NULL; if( ParseClassAdRvalExpr( m_unhibernate_constraint.Value(), requirements )!=0 || requirements==NULL ) { EXCEPT("Invalid expression for ROOSTER_UNHIBERNATE: %s\n", m_unhibernate_constraint.Value()); } unhibernateQuery.addANDConstraint(m_unhibernate_constraint.Value()); CollectorList* collects = daemonCore->getCollectorList(); ASSERT( collects ); QueryResult result; result = collects->query(unhibernateQuery,startdAds); if( result != Q_OK ) { dprintf(D_ALWAYS, "Couldn't fetch startd ads using constraint " "ROOSTER_UNHIBERNATE=%s: %s\n", m_unhibernate_constraint.Value(), getStrQueryResult(result)); return; } dprintf(D_FULLDEBUG,"Got %d startd ads matching ROOSTER_UNHIBERNATE=%s\n", startdAds.MyLength(), m_unhibernate_constraint.Value()); startdAds.Sort(StartdSortFunc,&m_rank_ad); startdAds.Open(); int num_woken = 0; ClassAd *startd_ad; HashTable<MyString,bool> machines_done(MyStringHash); while( (startd_ad=startdAds.Next()) ) { MyString machine; MyString name; startd_ad->LookupString(ATTR_MACHINE,machine); startd_ad->LookupString(ATTR_NAME,name); if( machines_done.exists(machine)==0 ) { dprintf(D_FULLDEBUG, "Skipping %s: already attempted to wake up %s in this cycle.\n", name.Value(),machine.Value()); continue; } // in case the unhibernate expression is time-sensitive, // re-evaluate it now to make sure it still passes if( !EvalBool(startd_ad,requirements) ) { dprintf(D_ALWAYS, "Skipping %s: ROOSTER_UNHIBERNATE is no longer true.\n", name.Value()); continue; } if( wakeUp(startd_ad) ) { machines_done.insert(machine,true); if( ++num_woken >= m_max_unhibernate && m_max_unhibernate > 0 ) { dprintf(D_ALWAYS, "Reached ROOSTER_MAX_UNHIBERNATE=%d in this cycle.\n", m_max_unhibernate); break; } } } startdAds.Close(); delete requirements; requirements = NULL; if( 
startdAds.MyLength() ) { dprintf(D_FULLDEBUG,"Done sending wakeup calls.\n"); } }
int main(int argc, char* argv[]) { Collectors = NULL; HistorySnapshot *historySnapshot; SQLQuery queryhor; SQLQuery queryver; QuillErrCode st; void **parameters; char *dbconn=NULL; bool readfromfile = false,remotequill=false; char *dbIpAddr=NULL, *dbName=NULL,*queryPassword=NULL,*quillName=NULL; AttrList *ad=0; int flag = 1; MyString tmp; int i; parameters = (void **) malloc(NUM_PARAMETERS * sizeof(void *)); myDistro->Init( argc, argv ); queryhor.setQuery(HISTORY_ALL_HOR, NULL); queryver.setQuery(HISTORY_ALL_VER, NULL); longformat=TRUE; for(i=1; i<argc; i++) { if(strcmp(argv[i], "-name")==0) { i++; if (argc <= i) { fprintf( stderr, "Error: Argument -name requires the name of a quilld as a parameter\n" ); exit(1); } if( !(quillName = get_daemon_name(argv[i])) ) { fprintf( stderr, "Error: unknown host %s\n", get_host_part(argv[i]) ); printf("\n"); print_wrapped_text("Extra Info: The name given with the -name " "should be the name of a condor_quilld process. " "Normally it is either a hostname, or " "\"name@hostname\". 
" "In either case, the hostname should be the " "Internet host name, but it appears that it " "wasn't.", stderr); exit(1); } tmp.sprintf ("%s == \"%s\"", ATTR_NAME, quillName); quillQuery.addORConstraint (tmp.Value()); tmp.sprintf ("%s == \"%s\"", ATTR_SCHEDD_NAME, quillName); quillQuery.addORConstraint (tmp.Value()); remotequill = true; readfromfile = false; } else if (strcmp(argv[i],"-help")==0) { Usage(argv[0],0); } } if (i<argc) Usage(argv[0]); config(); /* This call must happen AFTER config() is called */ if (checkDBconfig() == true && !readfromfile) { readfromfile = false; } else { /* couldn't get DB configuration, so bail out */ printf("Error: Cannot use DB to get history information\n"); exit(1); } if(readfromfile == false) { if(remotequill) { if (Collectors == NULL) { Collectors = CollectorList::create(); if(Collectors == NULL ) { printf("Error: Unable to get list of known collectors\n"); exit(1); } } result = Collectors->query ( quillQuery, quillList ); if(result != Q_OK) { printf("Fatal Error querying collectors\n"); exit(1); } if(quillList.MyLength() == 0) { printf("Error: Unknown quill server %s\n", quillName); exit(1); } quillList.Open(); while ((ad = quillList.Next())) { // get the address of the database dbIpAddr = dbName = queryPassword = NULL; if (!ad->LookupString(ATTR_QUILL_DB_IP_ADDR, &dbIpAddr) || !ad->LookupString(ATTR_QUILL_DB_NAME, &dbName) || !ad->LookupString(ATTR_QUILL_DB_QUERY_PASSWORD, &queryPassword) || (ad->LookupBool(ATTR_QUILL_IS_REMOTELY_QUERYABLE,flag) && !flag)) { printf("Error: The quill daemon \"%s\" is not set up " "for database queries\n", quillName); exit(1); } } } dbconn = getDBConnStr(quillName,dbIpAddr,dbName,queryPassword); historySnapshot = new HistorySnapshot(dbconn); //printf ("\n\n-- Quill: %s : %s : %s\n", quillName, dbIpAddr, dbName); st = historySnapshot->sendQuery(&queryhor, &queryver, longformat, true); //if there's a failure here and if we're not posing a query on a //remote quill daemon, we should instead 
query the local file if(st == QUILL_FAILURE) { printf( "-- Database at %s not reachable\n", dbIpAddr); } // query history table if (historySnapshot->isHistoryEmpty()) { printf("No historical jobs in the database\n"); } historySnapshot->release(); delete(historySnapshot); } if(parameters) free(parameters); if(dbIpAddr) free(dbIpAddr); if(dbName) free(dbName); if(queryPassword) free(queryPassword); if(quillName) free(quillName); if(dbconn) free(dbconn); return 0; }
bool Triggerd::PerformQueries() { ClassAdList result; CondorError errstack; QueryResult status; Trigger* trig = NULL; CondorQuery* query; bool ret_val = true; std::map<uint32_t,Trigger*>::iterator iter; ClassAd* ad = NULL; std::string eventText; char* token = NULL; std::string triggerText; char* queryString = NULL; ExprTree* attr = NULL; std::list<std::string> missing_nodes; size_t pos; size_t prev_pos; bool bad_trigger = false; const char* token_str = NULL; if (0 < triggers.size()) { dprintf(D_FULLDEBUG, "Triggerd: Evaluating %d triggers\n", (int)triggers.size()); query = new CondorQuery(ANY_AD); for (iter = triggers.begin(); iter != triggers.end(); iter++) { // Clear any pre-exhisting custom contraints and add the constraint // for this trigger trig = iter->second; query->clearORCustomConstraints(); query->clearANDCustomConstraints(); queryString = strdup(trig->GetQuery().c_str()); ReplaceAllChars(queryString, '\'', '"'); query->addANDConstraint(queryString); free(queryString); // Perform the query and check the result if (NULL != query_collector) { status = query->fetchAds(result, query_collector->addr(), &errstack); } else { status = collectors->query(*query, result, &errstack); } if (Q_OK != status) { // Problem with the query if (Q_COMMUNICATION_ERROR == status) { dprintf(D_ALWAYS, "Triggerd Error: Error contacting the collecter - %s\n", errstack.getFullText(true).c_str()); if (CEDAR_ERR_CONNECT_FAILED == errstack.code(0)) { dprintf(D_ALWAYS, "Triggerd Error: Couldn't contact the collector on the central manager\n"); } } else { dprintf(D_ALWAYS, "Triggerd Error: Could not retrieve ads - %s\n", getStrQueryResult(status)); } ret_val = false; break; } else { dprintf(D_FULLDEBUG, "Query successful. Parsing results\n"); // Query was successful, so parse the results result.Open(); while ((ad = result.Next())) { if (true == bad_trigger) { // Avoid processing a bad trigger multiple times. 
Remove // all result ads and reset the flag dprintf(D_FULLDEBUG, "Cleaning up after a bad trigger\n"); result.Delete(ad); while ((ad = result.Next())) { result.Delete(ad); } bad_trigger = false; break; } eventText = ""; triggerText = trig->GetText(); dprintf(D_FULLDEBUG, "Parsing trigger text '%s'\n", triggerText.c_str()); prev_pos = pos = 0; while (prev_pos < triggerText.length()) { pos = triggerText.find("$(", prev_pos, 2); if (std::string::npos == pos) { // Didn't find the start of a varible, so append the // remaining string dprintf(D_FULLDEBUG, "Adding text string to event text\n"); eventText += triggerText.substr(prev_pos, std::string::npos); prev_pos = triggerText.length(); } else { // Found a variable for substitution. Need to add // text before it to the string, grab the variable // to substitute for, and put its value in the text eventText += triggerText.substr(prev_pos, pos - prev_pos); dprintf(D_FULLDEBUG, "Adding text string prior to variable substitution to event text\n"); // Increment the position by 2 to skip the $( prev_pos = pos + 2; pos = triggerText.find(")", prev_pos, 1); if (std::string::npos == pos) { // Uh-oh. We have a start of a variable substitution // but no closing marker. dprintf(D_FULLDEBUG, "Error: Failed to find closing varable substitution marker ')'. Aborting processing of the trigger\n"); bad_trigger = true; break; } else { token_str = triggerText.substr(prev_pos, pos-prev_pos).c_str(); token = RemoveWS(token_str); dprintf(D_FULLDEBUG, "token: '%s'\n", token); if (NULL == token) { dprintf(D_ALWAYS, "Removing whitespace from %s produced unusable name. 
Aborting processing of the trigger\n", token_str); bad_trigger = true; break; } attr = ad->LookupExpr(token); if (NULL == attr) { // The token isn't found in the classad, so treat it // like a string dprintf(D_FULLDEBUG, "Adding text string to event text\n"); eventText += token; } else { dprintf(D_FULLDEBUG, "Adding classad value to event text\n"); eventText += ExprTreeToString(attr); } if (NULL != token) { free(token); token = NULL; } ++pos; } prev_pos = pos; } } // Remove the trailing space std::string::size_type notwhite = eventText.find_last_not_of(" "); eventText.erase(notwhite+1); // Send the event if (false == bad_trigger) { EventCondorTriggerNotify event(eventText, time(NULL)); singleton->getInstance()->raiseEvent(event); dprintf(D_FULLDEBUG, "Triggerd: Raised event with text '%s'\n", eventText.c_str()); } result.Delete(ad); } bad_trigger = false; result.Close(); } } delete query; } else { dprintf(D_FULLDEBUG, "Triggerd: No triggers to evaluate\n"); } // Look for absent nodes (nodes expected to be in the pool but aren't) if (NULL != console) { missing_nodes = console->findAbsentNodes(); if (0 < missing_nodes.size()) { for (std::list<std::string>::iterator node = missing_nodes.begin(); node != missing_nodes.end(); ++ node) { eventText = node->c_str(); eventText += " is missing from the pool"; EventCondorTriggerNotify event(eventText, time(NULL)); singleton->getInstance()->raiseEvent(event); dprintf(D_FULLDEBUG, "Triggerd: Raised event with text '%s'\n", eventText.c_str()); } } } return ret_val; }
// fetch all ads from the collector that satisfy the constraints QueryResult CondorQuery:: fetchAds (ClassAdList &adList, const char *poolName, CondorError* errstack) { Sock* sock; int more; QueryResult result; ClassAd queryAd(extraAttrs), *ad; if ( !poolName ) { return Q_NO_COLLECTOR_HOST; } // contact collector Daemon my_collector( DT_COLLECTOR, poolName, NULL ); if( !my_collector.locate() ) { // We were passed a bogus poolName, abort gracefully return Q_NO_COLLECTOR_HOST; } // make the query ad result = getQueryAd (queryAd); if (result != Q_OK) return result; if( IsDebugLevel( D_HOSTNAME ) ) { dprintf( D_HOSTNAME, "Querying collector %s (%s) with classad:\n", my_collector.addr(), my_collector.fullHostname() ); queryAd.dPrint( D_HOSTNAME ); dprintf( D_HOSTNAME, " --- End of Query ClassAd ---\n" ); } int mytimeout = param_integer ("QUERY_TIMEOUT",60); if (!(sock = my_collector.startCommand(command, Stream::reli_sock, mytimeout, errstack)) || !queryAd.put (*sock) || !sock->end_of_message()) { if (sock) { delete sock; } return Q_COMMUNICATION_ERROR; } // get result sock->decode (); more = 1; while (more) { if (!sock->code (more)) { sock->end_of_message(); delete sock; return Q_COMMUNICATION_ERROR; } if (more) { ad = new ClassAd; if( !ad->initFromStream(*sock) ) { sock->end_of_message(); delete ad; delete sock; return Q_COMMUNICATION_ERROR; } adList.Insert (ad); } } sock->end_of_message(); // finalize sock->close(); delete sock; return (Q_OK); }
int main (int argc, char *argv[]) { #if !defined(WIN32) install_sig_handler(SIGPIPE, (SIG_HANDLER)SIG_IGN ); #endif // initialize to read from config file myDistro->Init( argc, argv ); myName = argv[0]; config(); dprintf_config_tool_on_error(0); // The arguments take two passes to process --- the first pass // figures out the mode, after which we can instantiate the required // query object. We add implied constraints from the command line in // the second pass. firstPass (argc, argv); // if the mode has not been set, it is STARTD_NORMAL if (mode == MODE_NOTSET) { setMode (MODE_STARTD_NORMAL, 0, DEFAULT); } // instantiate query object if (!(query = new CondorQuery (type))) { dprintf_WriteOnErrorBuffer(stderr, true); fprintf (stderr, "Error: Out of memory\n"); exit (1); } // if a first-pass setMode set a mode_constraint, apply it now to the query object if (mode_constraint && ! explicit_format) { query->addANDConstraint(mode_constraint); } // set pretty print style implied by the type of entity being queried // but do it with default priority, so that explicitly requested options // can override it switch (type) { #ifdef HAVE_EXT_POSTGRESQL case QUILL_AD: setPPstyle(PP_QUILL_NORMAL, 0, DEFAULT); break; #endif /* HAVE_EXT_POSTGRESQL */ case DEFRAG_AD: setPPstyle(PP_GENERIC_NORMAL, 0, DEFAULT); break; case STARTD_AD: setPPstyle(PP_STARTD_NORMAL, 0, DEFAULT); break; case SCHEDD_AD: setPPstyle(PP_SCHEDD_NORMAL, 0, DEFAULT); break; case MASTER_AD: setPPstyle(PP_MASTER_NORMAL, 0, DEFAULT); break; case CKPT_SRVR_AD: setPPstyle(PP_CKPT_SRVR_NORMAL, 0, DEFAULT); break; case COLLECTOR_AD: setPPstyle(PP_COLLECTOR_NORMAL, 0, DEFAULT); break; case STORAGE_AD: setPPstyle(PP_STORAGE_NORMAL, 0, DEFAULT); break; case NEGOTIATOR_AD: setPPstyle(PP_NEGOTIATOR_NORMAL, 0, DEFAULT); break; case GRID_AD: setPPstyle(PP_GRID_NORMAL, 0, DEFAULT); break; case GENERIC_AD: setPPstyle(PP_GENERIC, 0, DEFAULT); break; case ANY_AD: setPPstyle(PP_ANY_NORMAL, 0, DEFAULT); break; default: 
setPPstyle(PP_VERBOSE, 0, DEFAULT); } // set the constraints implied by the mode switch (mode) { #ifdef HAVE_EXT_POSTGRESQL case MODE_QUILL_NORMAL: #endif /* HAVE_EXT_POSTGRESQL */ case MODE_DEFRAG_NORMAL: case MODE_STARTD_NORMAL: case MODE_MASTER_NORMAL: case MODE_CKPT_SRVR_NORMAL: case MODE_SCHEDD_NORMAL: case MODE_SCHEDD_SUBMITTORS: case MODE_COLLECTOR_NORMAL: case MODE_NEGOTIATOR_NORMAL: case MODE_STORAGE_NORMAL: case MODE_GENERIC_NORMAL: case MODE_ANY_NORMAL: case MODE_GRID_NORMAL: case MODE_HAD_NORMAL: break; case MODE_OTHER: // tell the query object what the type we're querying is query->setGenericQueryType(genericType); free(genericType); genericType = NULL; break; case MODE_STARTD_AVAIL: // For now, -avail shows you machines avail to anyone. sprintf (buffer, "%s == \"%s\"", ATTR_STATE, state_to_string(unclaimed_state)); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addORConstraint (buffer); break; case MODE_STARTD_RUN: sprintf (buffer, "%s == \"%s\"", ATTR_STATE, state_to_string(claimed_state)); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addORConstraint (buffer); break; case MODE_STARTD_COD: sprintf (buffer, "%s > 0", ATTR_NUM_COD_CLAIMS ); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addORConstraint (buffer); break; default: break; } if(javaMode) { sprintf( buffer, "%s == TRUE", ATTR_HAS_JAVA ); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addANDConstraint (buffer); projList.AppendArg(ATTR_HAS_JAVA); projList.AppendArg(ATTR_JAVA_MFLOPS); projList.AppendArg(ATTR_JAVA_VENDOR); projList.AppendArg(ATTR_JAVA_VERSION); } if(offlineMode) { query->addANDConstraint( "size( OfflineUniverses ) != 0" ); projList.AppendArg( "OfflineUniverses" ); // // Since we can't add a regex to a projection, explicitly list all // the attributes we know about. 
// projList.AppendArg( "HasVM" ); projList.AppendArg( "VMOfflineReason" ); projList.AppendArg( "VMOfflineTime" ); } if(absentMode) { sprintf( buffer, "%s == TRUE", ATTR_ABSENT ); if (diagnose) { printf( "Adding constraint %s\n", buffer ); } query->addANDConstraint( buffer ); projList.AppendArg( ATTR_ABSENT ); projList.AppendArg( ATTR_LAST_HEARD_FROM ); projList.AppendArg( ATTR_CLASSAD_LIFETIME ); } if(vmMode) { sprintf( buffer, "%s == TRUE", ATTR_HAS_VM); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addANDConstraint (buffer); projList.AppendArg(ATTR_VM_TYPE); projList.AppendArg(ATTR_VM_MEMORY); projList.AppendArg(ATTR_VM_NETWORKING); projList.AppendArg(ATTR_VM_NETWORKING_TYPES); projList.AppendArg(ATTR_VM_HARDWARE_VT); projList.AppendArg(ATTR_VM_AVAIL_NUM); projList.AppendArg(ATTR_VM_ALL_GUEST_MACS); projList.AppendArg(ATTR_VM_ALL_GUEST_IPS); projList.AppendArg(ATTR_VM_GUEST_MAC); projList.AppendArg(ATTR_VM_GUEST_IP); } // second pass: add regular parameters and constraints if (diagnose) { printf ("----------\n"); } secondPass (argc, argv); // initialize the totals object if (ppStyle == PP_CUSTOM && using_print_format) { if (pmHeadFoot & HF_NOSUMMARY) ppTotalStyle = PP_CUSTOM; } else { ppTotalStyle = ppStyle; } TrackTotals totals(ppTotalStyle); // fetch the query QueryResult q; if ((mode == MODE_STARTD_NORMAL) && (ppStyle == PP_STARTD_NORMAL)) { projList.AppendArg("Name"); projList.AppendArg("Machine"); projList.AppendArg("Opsys"); projList.AppendArg("Arch"); projList.AppendArg("State"); projList.AppendArg("Activity"); projList.AppendArg("LoadAvg"); projList.AppendArg("Memory"); projList.AppendArg("ActvtyTime"); projList.AppendArg("MyCurrentTime"); projList.AppendArg("EnteredCurrentActivity"); } else if( ppStyle == PP_VERBOSE ) { // Remove everything from the projection list if we're displaying // the "long form" of the ads. projList.Clear(); // but if -attributes was supplied, show only those attributes if ( ! 
dashAttributes.isEmpty()) { const char * s; dashAttributes.rewind(); while ((s = dashAttributes.next())) { projList.AppendArg(s); } } } if( projList.Count() > 0 ) { char **attr_list = projList.GetStringArray(); query->setDesiredAttrs(attr_list); deleteStringArray(attr_list); } // if diagnose was requested, just print the query ad if (diagnose) { ClassAd queryAd; // print diagnostic information about inferred internal state setMode ((Mode) 0, 0, NULL); setType (NULL, 0, NULL); setPPstyle ((ppOption) 0, 0, DEFAULT); printf ("----------\n"); q = query->getQueryAd (queryAd); fPrintAd (stdout, queryAd); printf ("----------\n"); fprintf (stderr, "Result of making query ad was: %d\n", q); exit (1); } // Address (host:port) is taken from requested pool, if given. char* addr = (NULL != pool) ? pool->addr() : NULL; Daemon* requested_daemon = pool; // If we're in "direct" mode, then we attempt to locate the daemon // associated with the requested subsystem (here encoded by value of mode) // In this case the host:port of pool (if given) denotes which // pool is being consulted if( direct ) { Daemon *d = NULL; switch( mode ) { case MODE_MASTER_NORMAL: d = new Daemon( DT_MASTER, direct, addr ); break; case MODE_STARTD_NORMAL: case MODE_STARTD_AVAIL: case MODE_STARTD_RUN: case MODE_STARTD_COD: d = new Daemon( DT_STARTD, direct, addr ); break; #ifdef HAVE_EXT_POSTGRESQL case MODE_QUILL_NORMAL: d = new Daemon( DT_QUILL, direct, addr ); break; #endif /* HAVE_EXT_POSTGRESQL */ case MODE_SCHEDD_NORMAL: case MODE_SCHEDD_SUBMITTORS: d = new Daemon( DT_SCHEDD, direct, addr ); break; case MODE_NEGOTIATOR_NORMAL: d = new Daemon( DT_NEGOTIATOR, direct, addr ); break; case MODE_CKPT_SRVR_NORMAL: case MODE_COLLECTOR_NORMAL: case MODE_LICENSE_NORMAL: case MODE_STORAGE_NORMAL: case MODE_GENERIC_NORMAL: case MODE_ANY_NORMAL: case MODE_OTHER: case MODE_GRID_NORMAL: case MODE_HAD_NORMAL: // These have to go to the collector, anyway. 
break; default: fprintf( stderr, "Error: Illegal mode %d\n", mode ); exit( 1 ); break; } // Here is where we actually override 'addr', if we can obtain // address of the requested daemon/subsys. If it can't be // located, then fail with error msg. // 'd' will be null (unset) if mode is one of above that must go to // collector (MODE_ANY_NORMAL, MODE_COLLECTOR_NORMAL, etc) if (NULL != d) { if( d->locate() ) { addr = d->addr(); requested_daemon = d; } else { const char* id = d->idStr(); if (NULL == id) id = d->name(); dprintf_WriteOnErrorBuffer(stderr, true); if (NULL == id) id = "daemon"; fprintf(stderr, "Error: Failed to locate %s\n", id); fprintf(stderr, "%s\n", d->error()); exit( 1 ); } } } ClassAdList result; CondorError errstack; if (NULL != ads_file) { MyString req; // query requirements q = query->getRequirements(req); const char * constraint = req.empty() ? NULL : req.c_str(); if (read_classad_file(ads_file, result, constraint)) { q = Q_OK; } } else if (NULL != addr) { // this case executes if pool was provided, or if in "direct" mode with // subsystem that corresponds to a daemon (above). // Here 'addr' represents either the host:port of requested pool, or // alternatively the host:port of daemon associated with requested subsystem (direct mode) q = query->fetchAds (result, addr, &errstack); } else { // otherwise obtain list of collectors and submit query that way CollectorList * collectors = CollectorList::create(); q = collectors->query (*query, result, &errstack); delete collectors; } // if any error was encountered during the query, report it and exit if (Q_OK != q) { dprintf_WriteOnErrorBuffer(stderr, true); // we can always provide these messages: fprintf( stderr, "Error: %s\n", getStrQueryResult(q) ); fprintf( stderr, "%s\n", errstack.getFullText(true).c_str() ); if ((NULL != requested_daemon) && ((Q_NO_COLLECTOR_HOST == q) || (requested_daemon->type() == DT_COLLECTOR))) { // Specific long message if connection to collector failed. 
const char* fullhost = requested_daemon->fullHostname(); if (NULL == fullhost) fullhost = "<unknown_host>"; const char* daddr = requested_daemon->addr(); if (NULL == daddr) daddr = "<unknown>"; char info[1000]; sprintf(info, "%s (%s)", fullhost, daddr); printNoCollectorContact( stderr, info, !expert ); } else if ((NULL != requested_daemon) && (Q_COMMUNICATION_ERROR == q)) { // more helpful message for failure to connect to some daemon/subsys const char* id = requested_daemon->idStr(); if (NULL == id) id = requested_daemon->name(); if (NULL == id) id = "daemon"; const char* daddr = requested_daemon->addr(); if (NULL == daddr) daddr = "<unknown>"; fprintf(stderr, "Error: Failed to contact %s at %s\n", id, daddr); } // fail exit (1); } if (noSort) { // do nothing } else if (sortSpecs.empty()) { // default classad sorting result.Sort((SortFunctionType)lessThanFunc); } else { // User requested custom sorting expressions: // insert attributes related to custom sorting result.Open(); while (ClassAd* ad = result.Next()) { for (vector<SortSpec>::iterator ss(sortSpecs.begin()); ss != sortSpecs.end(); ++ss) { ss->expr->SetParentScope(ad); classad::Value v; ss->expr->Evaluate(v); stringstream vs; // This will properly render all supported value types, // including undefined and error, although current semantic // pre-filters classads where sort expressions are undef/err: vs << ((v.IsStringValue())?"\"":"") << v << ((v.IsStringValue())?"\"":""); ad->AssignExpr(ss->keyAttr.c_str(), vs.str().c_str()); // Save the full expr in case user wants to examine on output: ad->AssignExpr(ss->keyExprAttr.c_str(), ss->arg.c_str()); } } result.Open(); result.Sort((SortFunctionType)customLessThanFunc); } // output result prettyPrint (result, &totals); delete query; return 0; }
int main( int argc, char *argv[] ) { const char *filename=0; char *pool=0; int command=-1; int i; bool use_tcp = false; bool with_ack = false; bool allow_multiple = false; param_functions *p_funcs = NULL; myDistro->Init( argc, argv ); config(); p_funcs = get_param_functions(); for( i=1; i<argc; i++ ) { if(!strcmp(argv[i],"-help")) { usage(argv[0]); exit(0); } else if(!strcmp(argv[i],"-pool")) { i++; if(!argv[i]) { fprintf(stderr,"-pool requires an argument.\n\n"); usage(argv[0]); exit(1); } pool = argv[i]; } else if(!strncmp(argv[i],"-tcp",strlen(argv[i]))) { use_tcp = true; } else if(!strncmp(argv[i],"-multiple",strlen(argv[i]))) { // We don't set allow_multiple=true by default, because // existing users (e.g. glideinWMS) have stray blank lines // in the input file. allow_multiple = true; } else if(!strcmp(argv[i],"-version")) { version(); exit(0); } else if(!strcmp(argv[i],"-debug")) { // dprintf to console Termlog = 1; p_funcs = get_param_functions(); dprintf_config ("TOOL", p_funcs); } else if(argv[i][0]!='-' || !strcmp(argv[i],"-")) { if(command==-1) { command = getCollectorCommandNum(argv[i]); if(command==-1) { fprintf(stderr,"Unknown command name %s\n\n",argv[i]); usage(argv[0]); exit(1); } } else if(!filename) { filename = argv[i]; } else { fprintf(stderr,"Extra argument: %s\n\n",argv[i]); usage(argv[0]); exit(1); } } else { fprintf(stderr,"Unknown argument: %s\n\n",argv[i]); usage(argv[0]); exit(1); } } FILE *file; ClassAdList ads; Daemon *collector; Sock *sock; switch( command ) { case UPDATE_STARTD_AD_WITH_ACK: with_ack = true; break; } if( with_ack ) { use_tcp = true; } if(!filename || !strcmp(filename,"-")) { file = stdin; filename = "(stdin)"; } else { file = safe_fopen_wrapper_follow(filename,"r"); } if(!file) { fprintf(stderr,"couldn't open %s: %s\n",filename,strerror(errno)); return 1; } while(!feof(file)) { int eof=0,error=0,empty=0; char const *delim = "\n"; if( !allow_multiple ) { delim = "***"; } ClassAd *ad = new ClassAd(file,const_cast<char 
*>(delim),eof,error,empty); if(error) { fprintf(stderr,"couldn't parse ClassAd in %s\n",filename); delete ad; return 1; } if( empty ) { delete ad; break; } if( !allow_multiple && ads.Length() > 0 ) { fprintf(stderr,"ERROR: failed to parse '%s' as a ClassAd attribute\n",delim); delete ad; return 1; } ads.Insert(ad); } if(ads.Length() == 0) { fprintf(stderr,"%s is empty\n",filename); return 1; } CollectorList * collectors; if ( pool ) { collector = new Daemon( DT_COLLECTOR, pool, 0 ); collectors = new CollectorList(); collectors->append (collector); } else { collectors = CollectorList::create(); } bool had_error = false; collectors->rewind(); while (collectors->next(collector)) { dprintf(D_FULLDEBUG,"locating collector %s...\n", collector->name()); if(!collector->locate()) { fprintf(stderr,"couldn't locate collector: %s\n",collector->error()); had_error = true; continue; } dprintf(D_FULLDEBUG,"collector is %s located at %s\n", collector->hostname(),collector->addr()); sock = NULL; ClassAd *ad; int success_count = 0; int failure_count = 0; ads.Rewind(); while( (ad=ads.Next()) ) { // If there's no "MyAddress", generate one.. if( !ad->Lookup( ATTR_MY_ADDRESS ) ) { MyString tmp; tmp.formatstr( "<%s:0>", my_ip_string() ); ad->Assign( ATTR_MY_ADDRESS, tmp.Value() ); } if ( use_tcp ) { if( !sock ) { sock = collector->startCommand(command,Stream::reli_sock,20); } else { // Use existing connection. sock->encode(); sock->put(command); } } else { // We must open a new UDP socket each time. 
delete sock; sock = collector->startCommand(command,Stream::safe_sock,20); } int result = 0; if ( sock ) { result += ad->put( *sock ); result += sock->end_of_message(); } if ( result != 2 ) { fprintf(stderr,"failed to send classad to %s\n",collector->addr()); had_error = true; failure_count++; delete sock; sock = NULL; continue; } if( with_ack ) { sock->decode(); int ok = 0; if( !sock->get(ok) || !sock->end_of_message() ) { fprintf(stderr,"failed to get ack from %s\n",collector->addr()); had_error = true; failure_count++; delete sock; sock = NULL; continue; } // ack protocol does not allow for multiple updates, // so close the socket now delete sock; sock = NULL; } success_count++; } if( sock ) { CondorVersionInfo const *ver = sock->get_peer_version(); if( !ver || ver->built_since_version(7,7,3) ) { // graceful hangup so the collector knows we are done sock->encode(); command = DC_NOP; sock->put(command); sock->end_of_message(); } delete sock; sock = NULL; } printf("Sent %d of %d ad%s to %s.\n", success_count, success_count + failure_count, success_count+failure_count == 1 ? "" : "s", collector->name()); } delete collectors; return (had_error)?1:0; }
void doContactSchedd() { if (command_queue.IsEmpty()) { daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); // Come back in a min return; } dprintf(D_FULLDEBUG,"in doContactSchedd\n"); SchedDRequest * current_command = NULL; int error=FALSE; std::string error_msg; CondorError errstack; bool do_reschedule = false; int failure_line_num = 0; int failure_errno = 0; // Try connecting to schedd DCSchedd dc_schedd ( ScheddAddr, ScheddPool ); if (dc_schedd.error() || !dc_schedd.locate()) { sprintf( error_msg, "Error locating schedd %s", ScheddAddr ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); // If you can't connect return "Failure" on every job request command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command == SchedDRequest::SDC_STATUS_CONSTRAINED) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0"}; enqueue_result (current_command->request_id, result, 3); } else if (current_command->command == SchedDRequest::SDC_SUBMIT_JOB) { const char * result[] = { GAHP_RESULT_FAILURE, NULL, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); } else if (current_command->command == SchedDRequest::SDC_UPDATE_LEASE) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; enqueue_result (current_command->request_id, result, 3); } else { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } current_command->status = SchedDRequest::SDCS_COMPLETED; } } SchedDRequest::schedd_command_type commands [] = { SchedDRequest::SDC_REMOVE_JOB, SchedDRequest::SDC_HOLD_JOB, SchedDRequest::SDC_RELEASE_JOB }; const char * command_titles [] = { "REMOVE_JOB", "HOLD_JOB", "RELEASE_JOB" }; // REMOVE // HOLD // RELEASE int i=0; while (i<3) { StringList id_list; SimpleList <SchedDRequest*> this_batch; SchedDRequest::schedd_command_type 
this_command = commands[i]; const char * this_action = command_titles[i]; const char * this_reason = NULL; dprintf (D_FULLDEBUG, "Processing %s requests\n", this_action); error = FALSE; // Create a batch of commands with the same command type AND the same reason command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != this_command) continue; if ((this_reason != NULL) && (strcmp (current_command->reason, this_reason) != 0)) continue; if (this_reason == NULL) this_reason = current_command->reason; char job_id_buff[30]; sprintf (job_id_buff, "%d.%d", current_command->cluster_id, current_command->proc_id); id_list.append (job_id_buff); this_batch.Append (current_command); } // If we haven't found any.... if (id_list.isEmpty()) { i++; continue; // ... then try the next command } // Perform the appropriate command on the current batch ClassAd * result_ad= NULL; if (this_command == SchedDRequest::SDC_REMOVE_JOB) { errstack.clear(); result_ad= dc_schedd.removeJobs ( &id_list, this_reason, &errstack); } else if (this_command == SchedDRequest::SDC_HOLD_JOB) { errstack.clear(); result_ad= dc_schedd.holdJobs ( &id_list, this_reason, NULL, &errstack); } else if (this_command == SchedDRequest::SDC_RELEASE_JOB) { errstack.clear(); result_ad= dc_schedd.releaseJobs ( &id_list, this_reason, &errstack); } else { EXCEPT( "Unexpected command type %d in doContactSchedd", this_command ); } // Analyze the result ad if (!result_ad) { error = TRUE; sprintf( error_msg, "Error connecting to schedd %s %s: %s", ScheddAddr, dc_schedd.addr(), errstack.getFullText() ); } else { result_ad->dPrint (D_FULLDEBUG); if ( this_command == SchedDRequest::SDC_RELEASE_JOB ) { do_reschedule = true; } } // Go through the batch again, and create responses for each request this_batch.Rewind(); while (this_batch.Next(current_command)) { // Check the result char job_id_buff[30]; if (result_ad && (error == 
FALSE)) { sprintf (job_id_buff, "job_%d_%d", current_command->cluster_id, current_command->proc_id); int remove_result; if (result_ad->LookupInteger (job_id_buff, remove_result)) { switch (remove_result) { case AR_ERROR: error = TRUE; error_msg = "General Error"; break; case AR_SUCCESS: error = FALSE; break; case AR_NOT_FOUND: error = TRUE; error_msg = "Job not found"; break; case AR_BAD_STATUS: error = TRUE; error_msg = "Bad job status"; break; case AR_ALREADY_DONE: error = TRUE; error_msg = "Already done"; break; case AR_PERMISSION_DENIED: error = TRUE; error_msg = "Permission denied"; break; default: error = TRUE; error_msg = "Unknown Result"; } // hctiws } else { error_msg = "Unable to get result"; } // fi lookup result for job } // fi error == FALSE if (error) { dprintf (D_ALWAYS, "Error (operation: %s) %d.%d: %s\n", this_action, current_command->cluster_id, current_command->proc_id, error_msg.c_str()); const char * result[2]; result[0] = GAHP_RESULT_FAILURE; result[1] = error_msg.c_str(); enqueue_result (current_command->request_id, result, 2); } else { dprintf (D_ALWAYS, "Succeess (operation: %s) %d.%d\n", this_action, current_command->cluster_id, current_command->proc_id); const char * result[2]; result[0] = GAHP_RESULT_SUCCESS; result[1] = NULL; enqueue_result (current_command->request_id, result, 2); } // fi error // Mark the status current_command->status = SchedDRequest::SDCS_COMPLETED; } // elihw this_batch if ( result_ad ) { delete result_ad; } } dprintf (D_FULLDEBUG, "Processing JOB_STAGE_IN requests\n"); // JOB_STAGE_IN int MAX_BATCH_SIZE=1; // This should be a config param SimpleList <SchedDRequest*> stage_in_batch; do { stage_in_batch.Clear(); command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_STAGE_IN) continue; dprintf (D_ALWAYS, "Adding %d.%d to STAGE_IN batch\n", current_command->cluster_id, 
current_command->proc_id); stage_in_batch.Append (current_command); if (stage_in_batch.Number() >= MAX_BATCH_SIZE) break; } if (stage_in_batch.Number() > 0) { ClassAd ** array = new ClassAd*[stage_in_batch.Number()]; i=0; stage_in_batch.Rewind(); while (stage_in_batch.Next(current_command)) { array[i++] = current_command->classad; } error = FALSE; errstack.clear(); if (!dc_schedd.spoolJobFiles( stage_in_batch.Number(), array, &errstack )) { error = TRUE; sprintf( error_msg, "Error sending files to schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } delete [] array; stage_in_batch.Rewind(); while (stage_in_batch.Next(current_command)) { current_command->status = SchedDRequest::SDCS_COMPLETED; if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } else { const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); } } // elihw (command_queue) } // fi has STAGE_IN requests } while (stage_in_batch.Number() > 0); dprintf (D_FULLDEBUG, "Processing JOB_STAGE_OUT requests\n"); // JOB_STAGE_OUT SimpleList <SchedDRequest*> stage_out_batch; command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_STAGE_OUT) continue; stage_out_batch.Append (current_command); } if (stage_out_batch.Number() > 0) { std::string constraint = ""; stage_out_batch.Rewind(); int jobsexpected = stage_out_batch.Number(); while (stage_out_batch.Next(current_command)) { sprintf_cat( constraint, "(ClusterId==%d&&ProcId==%d)||", current_command->cluster_id, current_command->proc_id ); } constraint += "False"; error = FALSE; errstack.clear(); int jobssent; if (!dc_schedd.receiveJobSandbox( constraint.c_str(), &errstack, &jobssent )) { error = TRUE; sprintf( error_msg, "Error receiving files 
from schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } if(error == FALSE && jobssent != jobsexpected) { error = TRUE; sprintf( error_msg, "Schedd %s didn't send expected files", ScheddAddr ); dprintf (D_ALWAYS, "Transfered files for %d jobs but got files for %d jobs. (Schedd %s with contraint %s\n", jobsexpected, jobssent, ScheddAddr, constraint.c_str()); } stage_out_batch.Rewind(); while (stage_out_batch.Next(current_command)) { current_command->status = SchedDRequest::SDCS_COMPLETED; if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } else { const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); } } // elihw (command_queue) } // fi has STAGE_OUT requests dprintf (D_FULLDEBUG, "Processing JOB_REFRESH_PROXY requests\n"); CondorVersionInfo ver_info(dc_schedd.version()); bool delegate_credential; if ( ver_info.built_since_version(6,7,19) && param_boolean( "DELEGATE_JOB_GSI_CREDENTIALS", true ) ) { delegate_credential = true; } else { delegate_credential = false; } // JOB_REFRESH_PROXY command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_REFRESH_PROXY) continue; time_t expiration_time = GetDesiredDelegatedJobCredentialExpiration(current_command->classad); time_t result_expiration_time = 0; bool result; errstack.clear(); if ( delegate_credential ) { result = dc_schedd.delegateGSIcredential( current_command->cluster_id, current_command->proc_id, current_command->proxy_file, expiration_time, &result_expiration_time, &errstack ); // Currently, we do not propagate the actual resulting // expiration time back to the gridmanager. We // probably should. 
} else { result = dc_schedd.updateGSIcredential( current_command->cluster_id, current_command->proc_id, current_command->proxy_file, &errstack ); } current_command->status = SchedDRequest::SDCS_COMPLETED; if (result == false) { sprintf( error_msg, "Error refreshing proxy to schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); const char * result_to_queue[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result_to_queue, 2); } else { const char * result_to_queue[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result_to_queue, 2); } } // Now do all the QMGMT transactions error = FALSE; // Try connecting to the queue Qmgr_connection * qmgr_connection; if ((qmgr_connection = ConnectQ(dc_schedd.addr(), QMGMT_TIMEOUT, false, NULL, NULL, dc_schedd.version() )) == NULL) { error = TRUE; sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } else { errno = 0; AbortTransaction(); // Just so we can call BeginTransaction() in the loop if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } dprintf (D_FULLDEBUG, "Processing UPDATE_CONSTRAINED/UDATE_JOB requests\n"); // UPDATE_CONSTRAINED // UDATE_JOB command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if ((current_command->command != SchedDRequest::SDC_UPDATE_CONSTRAINED) && (current_command->command != SchedDRequest::SDC_UPDATE_JOB)) continue; if (qmgr_connection == NULL) goto update_report_result; error = FALSE; errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } current_command->classad->ResetExpr(); ExprTree *tree; const char *lhstr, *rhstr; while( current_command->classad->NextExpr(lhstr, tree) ) { rhstr = 
ExprTreeToString( tree ); if( !lhstr || !rhstr) { sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s", current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } else { if (current_command->command == SchedDRequest::SDC_UPDATE_CONSTRAINED) { if( SetAttributeByConstraint(current_command->constraint, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed (errno=%d) to SetAttributeByConstraint %s=%s for constraint %s", errno, lhstr, rhstr, current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } else if (current_command->command == SchedDRequest::SDC_UPDATE_JOB) { if( SetAttribute(current_command->cluster_id, current_command->proc_id, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetAttribute() %s=%s for job %d.%d", lhstr, rhstr, current_command->cluster_id, current_command->proc_id); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } } if (error) break; } // elihw classad update_report_result: if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } } else { if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; } // fi } // elihw dprintf 
(D_FULLDEBUG, "Processing UPDATE_LEASE requests\n"); // UPDATE_LEASE command_queue.Rewind(); while (command_queue.Next(current_command)) { error = FALSE; if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_UPDATE_LEASE) continue; std::string success_job_ids=""; if (qmgr_connection == NULL) { sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr ); error = TRUE; } else { error = FALSE; errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } for (i=0; i<current_command->num_jobs; i++) { time_t time_now = time(NULL); int duration = current_command->expirations[i].expiration - time_now; dprintf (D_FULLDEBUG, "Job %d.%d SetTimerAttribute=%d\n", current_command->expirations[i].cluster, current_command->expirations[i].proc, duration); if (SetTimerAttribute (current_command->expirations[i].cluster, current_command->expirations[i].proc, ATTR_TIMER_REMOVE_CHECK, duration) < 0) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } dprintf (D_ALWAYS, "Unable to SetTimerAttribute(%d, %d), errno=%d\n", current_command->expirations[i].cluster, current_command->expirations[i].proc, errno); } else { // Append job id to the result line if (success_job_ids.length() > 0) success_job_ids += ","; sprintf_cat( success_job_ids, "%d.%d", current_command->expirations[i].cluster, current_command->expirations[i].proc); } } //rof jobs for request } // fi error if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } } else { 
if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, NULL, success_job_ids.length()?success_job_ids.c_str():NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } // fi } // elihw UPDATE_LEASE requests dprintf (D_FULLDEBUG, "Processing SUBMIT_JOB requests\n"); // SUBMIT_JOB command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_SUBMIT_JOB) continue; int ClusterId = -1; int ProcId = -1; if (qmgr_connection == NULL) { error = TRUE; goto submit_report_result; } errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } error = FALSE; if ((ClusterId = NewCluster()) >= 0) { ProcId = NewProc (ClusterId); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } if ( ClusterId < 0 ) { error = TRUE; error_msg = "Unable to create a new job cluster"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } else if ( ProcId < 0 ) { error = TRUE; error_msg = "Unable to create a new job proc"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } if ( ClusterId == -2 || ProcId == -2 ) { error = TRUE; error_msg = "Number of submitted jobs would exceed MAX_JOBS_SUBMITTED\n"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } // Adjust the argument/environment syntax based on the version // of the schedd we are talking to. if( error == FALSE) { CondorVersionInfo version_info(dc_schedd.version()); ArgList arglist; MyString arg_error_msg; Env env_obj; MyString env_error_msg; if(!arglist.AppendArgsFromClassAd(current_command->classad,&arg_error_msg) || ! 
arglist.InsertArgsIntoClassAd(current_command->classad,&version_info,&arg_error_msg)) { sprintf( error_msg, "ERROR: ClassAd problem in converting arguments to syntax " "for schedd (version=%s): %s\n", dc_schedd.version() ? dc_schedd.version() : "NULL", arg_error_msg.Value()); dprintf( D_ALWAYS,"%s\n", error_msg.c_str() ); error = TRUE; } if(!env_obj.MergeFrom(current_command->classad,&env_error_msg) || !env_obj.InsertEnvIntoClassAd(current_command->classad,&env_error_msg,NULL,&version_info)) { sprintf( error_msg, "ERROR: Failed to convert environment to target syntax" " for schedd (version %s): %s\n", dc_schedd.version() ? dc_schedd.version() : "NULL", env_error_msg.Value()); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } if( error == FALSE ) { // See the comment in the function body of ExpandInputFileList // for an explanation of what is going on here. MyString transfer_input_error_msg; if( !FileTransfer::ExpandInputFileList( current_command->classad, transfer_input_error_msg ) ) { dprintf( D_ALWAYS, "%s\n", transfer_input_error_msg.Value() ); error = TRUE; } } if ( error == FALSE ) { current_command->classad->Assign(ATTR_CLUSTER_ID, ClusterId); current_command->classad->Assign(ATTR_PROC_ID, ProcId); // Special case for the job lease int expire_time; if ( current_command->classad->LookupInteger( ATTR_TIMER_REMOVE_CHECK, expire_time ) ) { if ( SetTimerAttribute( ClusterId, ProcId, ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL) ) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetTimerAttribute %s=%ld for job %d.%d", ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL), ClusterId, ProcId ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; goto submit_report_result; } current_command->classad->Delete( ATTR_TIMER_REMOVE_CHECK ); } // Set all the classad attribute on the remote classad current_command->classad->ResetExpr(); 
ExprTree *tree; const char *lhstr, *rhstr; while( current_command->classad->NextExpr(lhstr, tree) ) { rhstr = ExprTreeToString( tree ); if( !lhstr || !rhstr) { sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s", current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } else if( SetAttribute (ClusterId, ProcId, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetAttribute %s=%s for job %d.%d", lhstr, rhstr, ClusterId, ProcId ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } if (error) break; } // elihw classad } // fi error==FALSE submit_report_result: char job_id_buff[30]; sprintf (job_id_buff, "%d.%d", ClusterId, ProcId); if (error) { const char * result[] = { GAHP_RESULT_FAILURE, job_id_buff, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } current_command->status = SchedDRequest::SDCS_COMPLETED; } else { if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, job_id_buff, NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } } // elihw dprintf (D_FULLDEBUG, "Processing STATUS_CONSTRAINED requests\n"); // STATUS_CONSTRAINED command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_STATUS_CONSTRAINED) continue; if (qmgr_connection != NULL) { SimpleList <MyString *> matching_ads; error = FALSE; ClassAd *next_ad; ClassAdList adlist; // Only use 
GetAllJobsByConstraint if remote schedd is // 6.9.5 or newer. Previous versions either did not // support this call, or they closed the Qmgmt connection // as a side-effect of this call. if( ver_info.built_since_version(6,9,5) ) { dprintf( D_FULLDEBUG, "Calling GetAllJobsByConstraint(%s)\n", current_command->constraint ); // NOTE: this could be made more efficient if we knew // the list of attributes to query. For lack of that, // we just get all attributes. GetAllJobsByConstraint( current_command->constraint, "", adlist); } else { // This is the old latency-prone method. dprintf( D_FULLDEBUG, "Calling GetNextJobByConstraint(%s)\n", current_command->constraint ); next_ad = GetNextJobByConstraint( current_command->constraint, 1 ); while( next_ad != NULL ) { adlist.Insert( next_ad ); next_ad = GetNextJobByConstraint( current_command->constraint, 0 ); } } // NOTE: ClassAdList will deallocate the ClassAds in it adlist.Rewind(); while( (next_ad=adlist.Next()) ) { MyString * da_buffer = new MyString(); // Use a ptr to avoid excessive copying if ( useXMLClassads ) { ClassAdXMLUnparser unparser; unparser.SetUseCompactSpacing(true); unparser.Unparse (next_ad, *da_buffer); } else { NewClassAdUnparser unparser; unparser.SetUseCompactSpacing(true); unparser.Unparse (next_ad, *da_buffer); } matching_ads.Append (da_buffer); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } // now output this list of classads into a result const char ** result = new const char* [matching_ads.Length() + 3]; std::string _ad_count; sprintf( _ad_count, "%d", matching_ads.Length() ); int count=0; result[count++] = GAHP_RESULT_SUCCESS; result[count++] = NULL; result[count++] = _ad_count.c_str(); MyString *next_string; matching_ads.Rewind(); while (matching_ads.Next(next_string)) { result[count++] = next_string->Value(); } enqueue_result (current_command->request_id, result, count); current_command->status = SchedDRequest::SDCS_COMPLETED; 
// Cleanup matching_ads.Rewind(); while (matching_ads.Next(next_string)) { delete next_string; } //CommitTransaction(); delete [] result; } else { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } } //elihw contact_schedd_disconnect: if ( qmgr_connection != NULL ) { DisconnectQ (qmgr_connection, FALSE); } if ( failure_line_num ) { // We had an error talking to the schedd. Take all of our // incomplete commands and mark them as failed. // TODO Consider retrying these commands, rather than // immediately marking them as failed. if ( failure_errno == ETIMEDOUT ) { dprintf( D_ALWAYS, "Timed out talking to schedd at line %d in " "doContactSchedd()\n", failure_line_num ); sprintf( error_msg, "Timed out talking to schedd" ); } else { dprintf( D_ALWAYS, "Error talking to schedd at line %d in " "doContactSchedd(), errno=%d (%s)\n", failure_line_num, failure_errno, strerror(failure_errno) ); sprintf( error_msg, "Error talking to schedd" ); } command_queue.Rewind(); while (command_queue.Next(current_command)) { if ( current_command->status != SchedDRequest::SDCS_NEW ) { continue; } switch( current_command->command ) { case SchedDRequest::SDC_UPDATE_JOB: case SchedDRequest::SDC_UPDATE_CONSTRAINED: { const char *result[2] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_UPDATE_LEASE: { const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_SUBMIT_JOB: { const char *result[3] = { GAHP_RESULT_FAILURE, "-1.-1", error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); current_command->status 
= SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_STATUS_CONSTRAINED: { const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; default: // Do nothing ; } } } if ( do_reschedule ) { dc_schedd.reschedule(); } // Write all of our results to our parent. flush_results(); dprintf (D_FULLDEBUG, "Finishing doContactSchedd()\n"); // Clean up the list command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status == SchedDRequest::SDCS_COMPLETED) { command_queue.DeleteCurrent(); delete current_command; } } // Come back soon.. // QUESTION: Should this always be a fixed time period? daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); }
void prettyPrint (ClassAdList &adList, TrackTotals *totals) { ppOption pps = using_print_format ? PP_CUSTOM : ppStyle; ClassAd *ad; int classad_index; int last_classad_index; bool fPrintHeadings = pm.has_headings() || (pm_head.Length() > 0); classad_index = 0; last_classad_index = adList.Length() - 1; adList.Open(); while ((ad = adList.Next())) { if (!wantOnlyTotals) { switch (pps) { case PP_STARTD_NORMAL: if (absentMode) { printStartdAbsent (ad, (classad_index == 0)); } else if( offlineMode ) { printStartdOffline( ad, (classad_index == 0)); } else { printStartdNormal (ad, (classad_index == 0)); } break; case PP_STARTD_SERVER: printServer (ad, (classad_index == 0)); break; case PP_STARTD_RUN: printRun (ad, (classad_index == 0)); break; case PP_STARTD_COD: printCOD (ad); break; case PP_STARTD_STATE: printState(ad, (classad_index == 0)); break; #ifdef HAVE_EXT_POSTGRESQL case PP_QUILL_NORMAL: printQuillNormal (ad); break; #endif /* HAVE_EXT_POSTGRESQL */ case PP_SCHEDD_NORMAL: printScheddNormal (ad, (classad_index == 0)); break; case PP_NEGOTIATOR_NORMAL: printNegotiatorNormal (ad, (classad_index == 0)); break; case PP_SCHEDD_SUBMITTORS: printScheddSubmittors (ad, (classad_index == 0)); break; case PP_VERBOSE: printVerbose (ad); break; case PP_XML: printXML (ad, (classad_index == 0), (classad_index == last_classad_index)); break; case PP_MASTER_NORMAL: printMasterNormal(ad, (classad_index == 0)); break; case PP_COLLECTOR_NORMAL: printCollectorNormal(ad, (classad_index == 0)); break; case PP_CKPT_SRVR_NORMAL: printCkptSrvrNormal(ad, (classad_index == 0)); break; case PP_STORAGE_NORMAL: printStorageNormal(ad, (classad_index == 0)); break; case PP_GRID_NORMAL: printGridNormal(ad, (classad_index == 0)); break; case PP_GENERIC_NORMAL: case PP_GENERIC: case PP_ANY_NORMAL: printAnyNormal(ad, (classad_index == 0)); break; case PP_CUSTOM: // hack: print a single item to a string, then discard the string // this makes sure that the headings line up correctly over the first // 
line of data. if (fPrintHeadings) { std::string tmp; pm.display(tmp, ad, targetAd); if (pm.has_headings()) { if ( ! (pmHeadFoot & HF_NOHEADER)) pm.display_Headings(stdout); } else { pm.display_Headings(stdout, pm_head); } fPrintHeadings = false; } printCustom (ad); break; case PP_NOTSET: fprintf (stderr, "Error: pretty printing set to PP_NOTSET.\n"); exit (1); default: fprintf (stderr, "Error: Unknown pretty print option.\n"); exit (1); } } classad_index++; totals->update(ad); } adList.Close(); // if there are no ads to print, but the user wanted XML output, // then print out the XML header and footer, so that naive XML // parsers won't get confused. if ( PP_XML == pps && 0 == classad_index ) { printXML (NULL, true, true); } // if totals are required, display totals if (adList.MyLength() > 0 && totals) totals->displayTotals(stdout, 20); }
void Defrag::queryDrainingCost() { ClassAdList startdAds; CondorQuery startdQuery(STARTD_AD); char const *desired_attrs[6]; desired_attrs[0] = ATTR_TOTAL_MACHINE_DRAINING_UNCLAIMED_TIME; desired_attrs[1] = ATTR_TOTAL_MACHINE_DRAINING_BADPUT; desired_attrs[2] = ATTR_DAEMON_START_TIME; desired_attrs[3] = ATTR_TOTAL_CPUS; desired_attrs[4] = ATTR_LAST_HEARD_FROM; desired_attrs[5] = NULL; startdQuery.setDesiredAttrs(desired_attrs); std::string query; // only want one ad per machine sprintf(query,"%s==1 && (%s =!= undefined || %s =!= undefined)", ATTR_SLOT_ID, ATTR_TOTAL_MACHINE_DRAINING_UNCLAIMED_TIME, ATTR_TOTAL_MACHINE_DRAINING_BADPUT); startdQuery.addANDConstraint(query.c_str()); CollectorList* collects = daemonCore->getCollectorList(); ASSERT( collects ); QueryResult result; result = collects->query(startdQuery,startdAds); if( result != Q_OK ) { dprintf(D_ALWAYS, "Couldn't fetch startd ads: %s\n", getStrQueryResult(result)); return; } double avg_badput = 0.0; double avg_unclaimed = 0.0; int total_cpus = 0; startdAds.Open(); ClassAd *startd_ad; while( (startd_ad=startdAds.Next()) ) { int unclaimed = 0; int badput = 0; int start_time = 0; int cpus = 0; int last_heard_from = 0; startd_ad->LookupInteger(ATTR_TOTAL_MACHINE_DRAINING_UNCLAIMED_TIME,unclaimed); startd_ad->LookupInteger(ATTR_TOTAL_MACHINE_DRAINING_BADPUT,badput); startd_ad->LookupInteger(ATTR_DAEMON_START_TIME,start_time); startd_ad->LookupInteger(ATTR_LAST_HEARD_FROM,last_heard_from); startd_ad->LookupInteger(ATTR_TOTAL_CPUS,cpus); int age = last_heard_from - start_time; if( last_heard_from == 0 || start_time == 0 || age <= 0 ) { continue; } avg_badput += ((double)badput)/age; avg_unclaimed += ((double)unclaimed)/age; total_cpus += cpus; } startdAds.Close(); if( total_cpus > 0 ) { avg_badput = avg_badput/total_cpus; avg_unclaimed = avg_unclaimed/total_cpus; } dprintf(D_ALWAYS,"Average pool draining badput = %.2f%%\n", avg_badput*100); dprintf(D_ALWAYS,"Average pool draining unclaimed = %.2f%%\n", 
avg_unclaimed*100); m_stats.AvgDrainingBadput = avg_badput; m_stats.AvgDrainingUnclaimed = avg_unclaimed; }
// callback used by fetchAds static bool fetchAds_callback(void* pv, ClassAd * ad) { ClassAdList * padList = (ClassAdList *)pv; padList->Insert (ad); return false; }
int main(int argc, char* argv[]) { Collectors = NULL; #ifdef HAVE_EXT_POSTGRESQL HistorySnapshot *historySnapshot; SQLQuery queryhor; SQLQuery queryver; QuillErrCode st; bool remotequill=false; char *quillName=NULL; AttrList *ad=0; int flag = 1; void **parameters; char *dbconn=NULL; char *completedsince = NULL; char *dbIpAddr=NULL, *dbName=NULL,*queryPassword=NULL; bool remoteread = false; #endif /* HAVE_EXT_POSTGRESQL */ const char *owner=NULL; bool readfromfile = true; bool fileisuserlog = false; char* JobHistoryFileName=NULL; const char * pcolon=NULL; GenericQuery constraint; // used to build a complex constraint. ExprTree *constraintExpr=NULL; std::string tmp; int i; myDistro->Init( argc, argv ); config(); #ifdef HAVE_EXT_POSTGRESQL parameters = (void **) malloc(NUM_PARAMETERS * sizeof(void *)); queryhor.setQuery(HISTORY_ALL_HOR, NULL); queryver.setQuery(HISTORY_ALL_VER, NULL); #endif /* HAVE_EXT_POSTGRESQL */ for(i=1; i<argc; i++) { if (is_dash_arg_prefix(argv[i],"long",1)) { longformat=TRUE; } else if (is_dash_arg_prefix(argv[i],"xml",3)) { use_xml = true; longformat = true; } else if (is_dash_arg_prefix(argv[i],"backwards",1)) { backwards=TRUE; } // must be at least -forw to avoid conflict with -f (for file) and -format else if (is_dash_arg_prefix(argv[i],"nobackwards",3) || is_dash_arg_prefix(argv[i],"forwards",4)) { backwards=FALSE; } else if (is_dash_arg_colon_prefix(argv[i],"wide", &pcolon, 1)) { wide_format=TRUE; if (pcolon) { wide_format_width = atoi(++pcolon); if ( ! 
mask.IsEmpty()) mask.SetOverallWidth(getDisplayWidth()-1); if (wide_format_width <= 80) wide_format = FALSE; } } else if (is_dash_arg_prefix(argv[i],"match",1) || is_dash_arg_prefix(argv[i],"limit",3)) { i++; if (argc <= i) { fprintf(stderr, "Error: Argument -match requires a number value " " as a parameter.\n"); exit(1); } specifiedMatch = atoi(argv[i]); } #ifdef HAVE_EXT_POSTGRESQL else if(is_dash_arg_prefix(argv[i], "dbname",1)) { i++; if (argc <= i) { fprintf( stderr, "Error: Argument -dbname requires the name of a quilld as a parameter\n" ); exit(1); } /* if( !(quillName = get_daemon_name(argv[i])) ) { fprintf( stderr, "Error: unknown host %s\n", get_host_part(argv[i]) ); printf("\n"); print_wrapped_text("Extra Info: The name given with the -dbname " "should be the name of a condor_quilld process. " "Normally it is either a hostname, or " "\"name@hostname\". " "In either case, the hostname should be the " "Internet host name, but it appears that it " "wasn't.", stderr); exit(1); } sprintf (tmp, "%s == \"%s\"", ATTR_NAME, quillName); quillQuery.addORConstraint (tmp); */ quillName = argv[i]; sprintf (tmp, "%s == \"%s\"", ATTR_SCHEDD_NAME, quillName); quillQuery.addORConstraint (tmp.c_str()); remotequill = false; readfromfile = false; } #endif /* HAVE_EXT_POSTGRESQL */ else if (is_dash_arg_prefix(argv[i],"file",2)) { if (i+1==argc || JobHistoryFileName) break; i++; JobHistoryFileName=argv[i]; readfromfile = true; } else if (is_dash_arg_prefix(argv[i],"userlog",1)) { if (i+1==argc || JobHistoryFileName) break; i++; JobHistoryFileName=argv[i]; readfromfile = true; fileisuserlog = true; } else if (is_dash_arg_prefix(argv[i],"help",1)) { Usage(argv[0],0); } else if (is_dash_arg_prefix(argv[i],"format",1)) { if (argc <= i + 2) { fprintf(stderr, "Error: Argument -format requires a spec and " "classad attribute name as parameters.\n"); fprintf(stderr, "\t\te.g. 
condor_history -format '%%d' ClusterId\n"); exit(1); } mask.registerFormat(argv[i + 1], argv[i + 2]); customFormat = true; i += 2; } else if (*(argv[i]) == '-' && (is_arg_colon_prefix(argv[i]+1,"af", &pcolon, 2) || is_arg_colon_prefix(argv[i]+1,"autoformat", &pcolon, 5))) { // make sure we have at least one argument to autoformat if (argc <= i+1 || *(argv[i+1]) == '-') { fprintf (stderr, "Error: Argument %s requires at last one attribute parameter\n", argv[i]); fprintf(stderr, "\t\te.g. condor_history %s ClusterId\n", argv[i]); exit(1); } if (pcolon) ++pcolon; // if there are options, skip over the colon to the options. int ixNext = parse_autoformat_args(argc, argv, i+1, pcolon, mask, diagnostic); if (ixNext > i) i = ixNext-1; customFormat = true; } else if (is_dash_arg_colon_prefix(argv[i], "print-format", &pcolon, 2)) { if ( (argc <= i+1) || (*(argv[i+1]) == '-' && (argv[i+1])[1] != 0)) { fprintf( stderr, "Error: Argument -print-format requires a filename argument\n"); exit( 1 ); } // hack allow -pr ! to disable use of user-default print format files. if (MATCH == strcmp(argv[i+1], "!")) { ++i; disable_user_print_files = true; continue; } if ( ! wide_format) mask.SetOverallWidth(getDisplayWidth()-1); customFormat = true; ++i; std::string where_expr; if (set_print_mask_from_stream(mask, where_expr, argv[i], true) < 0) { fprintf(stderr, "Error: cannot execute print-format file %s\n", argv[i]); exit (1); } if ( ! 
where_expr.empty()) { constraint.addCustomAND(where_expr.c_str()); } } else if (is_dash_arg_prefix(argv[i],"constraint",1)) { // make sure we have at least one more argument if (argc <= i+1) { fprintf( stderr, "Error: Argument %s requires another parameter\n", argv[i]); exit(1); } i++; constraint.addCustomAND(argv[i]); } #ifdef HAVE_EXT_POSTGRESQL else if (is_dash_arg_prefix(argv[i],"completedsince",3)) { i++; if (argc <= i) { fprintf(stderr, "Error: Argument -completedsince requires a date and " "optional timestamp as a parameter.\n"); fprintf(stderr, "\t\te.g. condor_history -completedsince \"2004-10-19 10:23:54\"\n"); exit(1); } if (constraint!="") break; completedsince = strdup(argv[i]); parameters[0] = completedsince; queryhor.setQuery(HISTORY_COMPLETEDSINCE_HOR,parameters); queryver.setQuery(HISTORY_COMPLETEDSINCE_VER,parameters); } #endif /* HAVE_EXT_POSTGRESQL */ else if (sscanf (argv[i], "%d.%d", &cluster, &proc) == 2) { std::string jobconst; formatstr (jobconst, "%s == %d && %s == %d", ATTR_CLUSTER_ID, cluster,ATTR_PROC_ID, proc); constraint.addCustomOR(jobconst.c_str()); #ifdef HAVE_EXT_POSTGRESQL parameters[0] = &cluster; parameters[1] = &proc; queryhor.setQuery(HISTORY_CLUSTER_PROC_HOR, parameters); queryver.setQuery(HISTORY_CLUSTER_PROC_VER, parameters); #endif /* HAVE_EXT_POSTGRESQL */ } else if (sscanf (argv[i], "%d", &cluster) == 1) { std::string jobconst; formatstr (jobconst, "%s == %d", ATTR_CLUSTER_ID, cluster); constraint.addCustomOR(jobconst.c_str()); #ifdef HAVE_EXT_POSTGRESQL parameters[0] = &cluster; queryhor.setQuery(HISTORY_CLUSTER_HOR, parameters); queryver.setQuery(HISTORY_CLUSTER_VER, parameters); #endif /* HAVE_EXT_POSTGRESQL */ } else if (is_dash_arg_prefix(argv[i],"debug",1)) { // dprintf to console dprintf_set_tool_debug("TOOL", 0); } else if (is_dash_arg_prefix(argv[i],"diagnostic",4)) { // dprintf to console diagnostic = true; } else if (is_dash_arg_prefix(argv[i], "name", 1)) { i++; if (argc <= i) { fprintf(stderr, "Error: 
Argument -name requires name of a remote schedd\n"); fprintf(stderr, "\t\te.g. condor_history -name submit.example.com \n"); exit(1); } g_name = argv[i]; readfromfile = false; #ifdef HAVE_EXT_POSTGRESQL remoteread = true; #endif } else if (is_dash_arg_prefix(argv[i], "pool", 1)) { i++; if (argc <= i) { fprintf(stderr, "Error: Argument -name requires name of a remote schedd\n"); fprintf(stderr, "\t\te.g. condor_history -name submit.example.com \n"); exit(1); } g_pool = argv[i]; readfromfile = false; #ifdef HAVE_EXT_POSTGRESQL remoteread = true; #endif } else { std::string ownerconst; owner = argv[i]; formatstr(ownerconst, "%s == \"%s\"", ATTR_OWNER, owner); constraint.addCustomOR(ownerconst.c_str()); #ifdef HAVE_EXT_POSTGRESQL parameters[0] = owner; queryhor.setQuery(HISTORY_OWNER_HOR, parameters); queryver.setQuery(HISTORY_OWNER_VER, parameters); #endif /* HAVE_EXT_POSTGRESQL */ } } if (i<argc) Usage(argv[0]); MyString my_constraint; constraint.makeQuery(my_constraint); if (diagnostic) { fprintf(stderr, "Using effective constraint: %s\n", my_constraint.c_str()); } if ( ! 
my_constraint.empty() && ParseClassAdRvalExpr( my_constraint.c_str(), constraintExpr ) ) { fprintf( stderr, "Error: could not parse constraint %s\n", my_constraint.c_str() ); exit( 1 ); } #ifdef HAVE_EXT_POSTGRESQL /* This call must happen AFTER config() is called */ if (checkDBconfig() == true && !readfromfile) { readfromfile = false; } else { readfromfile = true; } #endif /* HAVE_EXT_POSTGRESQL */ #ifdef HAVE_EXT_POSTGRESQL if(!readfromfile && !remoteread) { if(remotequill) { if (Collectors == NULL) { Collectors = CollectorList::create(); if(Collectors == NULL ) { printf("Error: Unable to get list of known collectors\n"); exit(1); } } result = Collectors->query ( quillQuery, quillList ); if(result != Q_OK) { printf("Fatal Error querying collectors\n"); exit(1); } if(quillList.MyLength() == 0) { printf("Error: Unknown quill server %s\n", quillName); exit(1); } quillList.Open(); while ((ad = quillList.Next())) { // get the address of the database dbIpAddr = dbName = queryPassword = NULL; if (!ad->LookupString(ATTR_QUILL_DB_IP_ADDR, &dbIpAddr) || !ad->LookupString(ATTR_QUILL_DB_NAME, &dbName) || !ad->LookupString(ATTR_QUILL_DB_QUERY_PASSWORD, &queryPassword) || (ad->LookupBool(ATTR_QUILL_IS_REMOTELY_QUERYABLE,flag) && !flag)) { printf("Error: The quill daemon \"%s\" is not set up " "for database queries\n", quillName); exit(1); } } } else { // they just typed 'condor_history' on the command line and want // to use quill, so get the schedd ad for the local machine if // we can, figure out the name of the schedd and the // jobqueuebirthdate Daemon schedd( DT_SCHEDD, 0, 0 ); if ( schedd.locate(Daemon::LOCATE_FULL) ) { char *scheddname = quillName; if (scheddname == NULL) { // none set explictly, look it up in the daemon ad scheddname = schedd.name(); ClassAd *daemonAd = schedd.daemonAd(); int scheddbirthdate; if(daemonAd) { if(daemonAd->LookupInteger( ATTR_JOB_QUEUE_BIRTHDATE, scheddbirthdate) ) { queryhor.setJobqueuebirthdate( (time_t)scheddbirthdate); 
queryver.setJobqueuebirthdate( (time_t)scheddbirthdate); } } } else { queryhor.setJobqueuebirthdate(0); queryver.setJobqueuebirthdate(0); } queryhor.setScheddname(scheddname); queryver.setScheddname(scheddname); } } dbconn = getDBConnStr(quillName,dbIpAddr,dbName,queryPassword); historySnapshot = new HistorySnapshot(dbconn); if (!customFormat) { printf ("\n\n-- Quill: %s : %s : %s\n", quillName, dbIpAddr, dbName); } queryhor.prepareQuery(); // create the query strings before sending off to historySnapshot queryver.prepareQuery(); st = historySnapshot->sendQuery(&queryhor, &queryver, longformat, false, customFormat, &mask, constraint.c_str()); //if there's a failure here and if we're not posing a query on a //remote quill daemon, we should instead query the local file if(st == QUILL_FAILURE) { printf( "-- Database at %s not reachable\n", dbIpAddr); if(!remotequill) { char *tmp_hist = param("HISTORY"); if (!customFormat) { printf( "--Failing over to the history file at %s instead --\n", tmp_hist ? tmp_hist : "(null)" ); } if(!tmp_hist) { free(tmp_hist); } readfromfile = true; } } // query history table if (historySnapshot->isHistoryEmpty()) { printf("No historical jobs in the database match your query\n"); } historySnapshot->release(); delete(historySnapshot); } #endif /* HAVE_EXT_POSTGRESQL */ if(readfromfile == true) { readHistoryFromFiles(fileisuserlog, JobHistoryFileName, my_constraint.c_str(), constraintExpr); } else { readHistoryRemote(constraintExpr); } #ifdef HAVE_EXT_POSTGRESQL if(completedsince) free(completedsince); if(parameters) free(parameters); if(dbIpAddr) free(dbIpAddr); if(dbName) free(dbName); if(queryPassword) free(queryPassword); if(dbconn) free(dbconn); #endif return 0; }
void VMRegister::requestHostClassAds(void) { // find host startd daemon if( !m_vm_host_daemon ) m_vm_host_daemon = vmapi_findDaemon( m_vm_host_name, DT_STARTD); if( !m_vm_host_daemon ) { dprintf( D_FULLDEBUG, "Can't find host(%s) Startd daemon\n", m_vm_host_name ); return; } ClassAd query_ad; query_ad.SetMyTypeName(QUERY_ADTYPE); query_ad.SetTargetTypeName(STARTD_ADTYPE); query_ad.Assign(ATTR_REQUIREMENTS, true); char *addr = m_vm_host_daemon->addr(); Daemon hstartd(DT_STARTD, addr); ReliSock ssock; ssock.timeout( VM_SOCKET_TIMEOUT ); ssock.encode(); if( !ssock.connect(addr) ) { dprintf( D_FULLDEBUG, "Failed to connect to host startd(%s)\n to get host classAd", addr); return; } if(!hstartd.startCommand( QUERY_STARTD_ADS, &ssock )) { dprintf( D_FULLDEBUG, "Failed to send QUERY_STARTD_ADS command to host startd(%s)\n", addr); return; } if( !query_ad.put(ssock) ) { dprintf(D_FULLDEBUG, "Failed to send query Ad to host startd(%s)\n", addr); } if( !ssock.end_of_message() ) { dprintf(D_FULLDEBUG, "Failed to send query EOM to host startd(%s)\n", addr); } // Read host classAds ssock.timeout( VM_SOCKET_TIMEOUT ); ssock.decode(); int more = 1, num_ads = 0; ClassAdList adList; ClassAd *ad; while (more) { if( !ssock.code(more) ) { ssock.end_of_message(); return; } if(more) { ad = new ClassAd; if( !ad->initFromStream(ssock) ) { ssock.end_of_message(); delete ad; return; } adList.Insert(ad); num_ads++; } } ssock.end_of_message(); dprintf(D_FULLDEBUG, "Got %d classAds from host\n", num_ads); // Although we can get more than one classAd from host machine, // we use only the first one classAd adList.Rewind(); ad = adList.Next(); #if !defined(WANT_OLD_CLASSADS) ad->AddTargetRefs( TargetJobAttrs ); #endif // Get each Attribute from the classAd // added "HOST_" in front of each Attribute name const char *name; ExprTree *expr; ad->ResetExpr(); while( ad->NextExpr(name, expr) ) { MyString attr; attr += "HOST_"; attr += name; // Insert or Update an attribute to host_classAd in a 
VMRegister object ExprTree * pTree = expr->Copy(); host_classad->Insert(attr.Value(), pTree, true); } }
void Defrag::poll() { dprintf(D_FULLDEBUG,"Evaluating defragmentation policy.\n"); // If we crash during this polling cycle, we will have saved // the time of the last poll, so the next cycle will be // scheduled on the false assumption that a cycle ran now. In // this way, we error on the side of draining too little // rather than too much. time_t now = time(NULL); time_t prev = m_last_poll; m_last_poll = now; saveState(); m_stats.Tick(); int num_to_drain = m_draining_per_poll; time_t last_hour = (prev / 3600)*3600; time_t current_hour = (now / 3600)*3600; time_t last_day = (prev / (3600*24))*3600*24; time_t current_day = (now / (3600*24))*3600*24; if( current_hour != last_hour ) { num_to_drain += prorate(m_draining_per_poll_hour,now-current_hour,3600,m_polling_interval); } if( current_day != last_day ) { num_to_drain += prorate(m_draining_per_poll_day,now-current_day,3600*24,m_polling_interval); } int num_draining = countMachines(DRAINING_CONSTRAINT,"<InternalDrainingConstraint>"); m_stats.MachinesDraining = num_draining; MachineSet whole_machines; int num_whole_machines = countMachines(m_whole_machine_expr.c_str(),"DEFRAG_WHOLE_MACHINE_EXPR",&whole_machines); m_stats.WholeMachines = num_whole_machines; dprintf(D_ALWAYS,"There are currently %d draining and %d whole machines.\n", num_draining,num_whole_machines); queryDrainingCost(); // If possible, cancel some drains. 
MachineSet cancelled_machines; poll_cancel(cancelled_machines); if( num_to_drain <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because number to drain in next %ds is calculated to be 0.\n", m_polling_interval); return; } if( (int)ceil(m_draining_per_hour) <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_DRAINING_MACHINES_PER_HOUR=%f\n", m_draining_per_hour); return; } if( m_max_draining == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=0\n"); return; } if( m_max_whole_machines == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=0\n"); return; } if( m_max_draining >= 0 ) { if( num_draining >= m_max_draining ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and there are %d draining machines.\n", m_max_draining, num_draining); return; } else if( num_draining < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and the query to count draining machines failed.\n", m_max_draining); return; } } if( m_max_whole_machines >= 0 ) { if( num_whole_machines >= m_max_whole_machines ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=%d and there are %d whole machines.\n", m_max_whole_machines, num_whole_machines); return; } } // Even if m_max_whole_machines is -1 (infinite), we still need // the list of whole machines in order to filter them out in // the draining selection algorithm, so abort now if the // whole machine query failed. 
if( num_whole_machines < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because the query to find whole machines failed.\n"); return; } dprintf(D_ALWAYS,"Looking for %d machines to drain.\n",num_to_drain); ClassAdList startdAds; std::string requirements; sprintf(requirements,"(%s) && Draining =!= true",m_defrag_requirements.c_str()); if( !queryMachines(requirements.c_str(),"DEFRAG_REQUIREMENTS",startdAds) ) { dprintf(D_ALWAYS,"Doing nothing, because the query to select machines matching DEFRAG_REQUIREMENTS failed.\n"); return; } startdAds.Shuffle(); startdAds.Sort(StartdSortFunc,&m_rank_ad); startdAds.Open(); int num_drained = 0; ClassAd *startd_ad_ptr; MachineSet machines_done; while( (startd_ad_ptr=startdAds.Next()) ) { if (!startd_ad_ptr) continue; ClassAd &startd_ad = *startd_ad_ptr; std::string machine; std::string name; startd_ad.LookupString(ATTR_NAME,name); slotNameToDaemonName(name,machine); // If we have already cancelled draining on this machine, ignore it for this cycle. if( cancelled_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already cancelled draining of %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( machines_done.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already attempted to drain %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( whole_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: because it is already running as a whole machine.\n", name.c_str()); continue; } if( drain(startd_ad) ) { machines_done.insert(machine); if( ++num_drained >= num_to_drain ) { dprintf(D_ALWAYS, "Drained maximum number of machines allowed in this cycle (%d).\n", num_to_drain); break; } } } startdAds.Close(); dprintf(D_ALWAYS,"Drained %d machines (wanted to drain %d machines).\n", num_drained,num_drained); dprintf(D_FULLDEBUG,"Done evaluating defragmentation policy.\n"); }
void Defrag::poll() { dprintf(D_FULLDEBUG,"Evaluating defragmentation policy.\n"); // If we crash during this polling cycle, we will have saved // the time of the last poll, so the next cycle will be // scheduled on the false assumption that a cycle ran now. In // this way, we error on the side of draining too little // rather than too much. time_t now = time(NULL); time_t prev = m_last_poll; m_last_poll = now; saveState(); m_stats.Tick(); int num_to_drain = m_draining_per_poll; time_t last_hour = (prev / 3600)*3600; time_t current_hour = (now / 3600)*3600; time_t last_day = (prev / (3600*24))*3600*24; time_t current_day = (now / (3600*24))*3600*24; if( current_hour != last_hour ) { num_to_drain += prorate(m_draining_per_poll_hour,now-current_hour,3600,m_polling_interval); } if( current_day != last_day ) { num_to_drain += prorate(m_draining_per_poll_day,now-current_day,3600*24,m_polling_interval); } MachineSet draining_machines; int num_draining = countMachines(DRAINING_CONSTRAINT,"<InternalDrainingConstraint>", &draining_machines); m_stats.MachinesDraining = num_draining; MachineSet whole_machines; int num_whole_machines = countMachines(m_whole_machine_expr.c_str(),"DEFRAG_WHOLE_MACHINE_EXPR",&whole_machines); m_stats.WholeMachines = num_whole_machines; dprintf(D_ALWAYS,"There are currently %d draining and %d whole machines.\n", num_draining,num_whole_machines); // Calculate arrival rate of fully drained machines. This is a bit tricky because we poll. // We count by finding the newly-arrived // fully drained machines, and add to that count machines which are no-longer draining. // This allows us to find machines that have fully drained, but were then claimed between // polling cycles. 
MachineSet new_machines; MachineSet no_longer_whole_machines; // Find newly-arrived machines std::set_difference(whole_machines.begin(), whole_machines.end(), m_prev_whole_machines.begin(), m_prev_whole_machines.end(), std::inserter(new_machines, new_machines.begin())); // Now, newly-departed machines std::set_difference(m_prev_draining_machines.begin(), m_prev_draining_machines.end(), draining_machines.begin(), draining_machines.end(), std::inserter(no_longer_whole_machines, no_longer_whole_machines.begin())); dprintf_set("Set of current whole machines is ", &whole_machines); dprintf_set("Set of current draining machine is ", &draining_machines); dprintf_set("Newly Arrived whole machines is ", &new_machines); dprintf_set("Newly departed draining machines is ", &no_longer_whole_machines); m_prev_draining_machines = draining_machines; m_prev_whole_machines = whole_machines; int newly_drained = new_machines.size() + no_longer_whole_machines.size(); double arrival_rate = 0.0; // If there is an arrival... if (newly_drained > 0) { time_t current = time(0); // And it isn't the first one since defrag boot... 
if (m_last_whole_machine_arrival > 0) { m_whole_machines_arrived += newly_drained; time_t arrival_time = current - m_last_whole_machine_arrival; if (arrival_time < 1) arrival_time = 1; // very unlikely, but just in case m_whole_machine_arrival_sum += newly_drained * arrival_time; arrival_rate = newly_drained / ((double)arrival_time); dprintf(D_ALWAYS, "Arrival rate is %g machines/hour\n", arrival_rate * 3600.0); } m_last_whole_machine_arrival = current; } dprintf(D_ALWAYS, "Lifetime whole machines arrived: %d\n", m_whole_machines_arrived); if (m_whole_machine_arrival_sum > 0) { double lifetime_mean = m_whole_machines_arrived / m_whole_machine_arrival_sum; dprintf(D_ALWAYS, "Lifetime mean arrival rate: %g machines / hour\n", 3600.0 * lifetime_mean); if (newly_drained > 0) { double diff = arrival_rate - lifetime_mean; m_whole_machine_arrival_mean_squared += diff * diff; } double sd = sqrt(m_whole_machine_arrival_mean_squared / m_whole_machines_arrived); dprintf(D_ALWAYS, "Lifetime mean arrival rate sd: %g\n", sd * 3600); m_stats.MeanDrainedArrival = lifetime_mean; m_stats.MeanDrainedArrivalSD = sd; m_stats.DrainedMachines = m_whole_machines_arrived; } queryDrainingCost(); // If possible, cancel some drains. 
MachineSet cancelled_machines; poll_cancel(cancelled_machines); if( num_to_drain <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because number to drain in next %ds is calculated to be 0.\n", m_polling_interval); return; } if( (int)ceil(m_draining_per_hour) <= 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_DRAINING_MACHINES_PER_HOUR=%f\n", m_draining_per_hour); return; } if( m_max_draining == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=0\n"); return; } if( m_max_whole_machines == 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=0\n"); return; } if( m_max_draining >= 0 ) { if( num_draining >= m_max_draining ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and there are %d draining machines.\n", m_max_draining, num_draining); return; } else if( num_draining < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_CONCURRENT_DRAINING=%d and the query to count draining machines failed.\n", m_max_draining); return; } } if( m_max_whole_machines >= 0 ) { if( num_whole_machines >= m_max_whole_machines ) { dprintf(D_ALWAYS,"Doing nothing, because DEFRAG_MAX_WHOLE_MACHINES=%d and there are %d whole machines.\n", m_max_whole_machines, num_whole_machines); return; } } // Even if m_max_whole_machines is -1 (infinite), we still need // the list of whole machines in order to filter them out in // the draining selection algorithm, so abort now if the // whole machine query failed. 
if( num_whole_machines < 0 ) { dprintf(D_ALWAYS,"Doing nothing, because the query to find whole machines failed.\n"); return; } dprintf(D_ALWAYS,"Looking for %d machines to drain.\n",num_to_drain); ClassAdList startdAds; std::string requirements; formatstr(requirements,"(%s) && Draining =!= true",m_defrag_requirements.c_str()); if( !queryMachines(requirements.c_str(),"DEFRAG_REQUIREMENTS",startdAds) ) { dprintf(D_ALWAYS,"Doing nothing, because the query to select machines matching DEFRAG_REQUIREMENTS failed.\n"); return; } startdAds.Shuffle(); startdAds.Sort(StartdSortFunc,&m_rank_ad); startdAds.Open(); int num_drained = 0; ClassAd *startd_ad_ptr; MachineSet machines_done; while( (startd_ad_ptr=startdAds.Next()) ) { ClassAd &startd_ad = *startd_ad_ptr; std::string machine; std::string name; startd_ad.LookupString(ATTR_NAME,name); slotNameToDaemonName(name,machine); // If we have already cancelled draining on this machine, ignore it for this cycle. if( cancelled_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already cancelled draining of %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( machines_done.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: already attempted to drain %s in this cycle.\n", name.c_str(),machine.c_str()); continue; } if( whole_machines.count(machine) ) { dprintf(D_FULLDEBUG, "Skipping %s: because it is already running as a whole machine.\n", name.c_str()); continue; } if( drain(startd_ad) ) { machines_done.insert(machine); if( ++num_drained >= num_to_drain ) { dprintf(D_ALWAYS, "Drained maximum number of machines allowed in this cycle (%d).\n", num_to_drain); break; } } } startdAds.Close(); dprintf(D_ALWAYS,"Drained %d machines (wanted to drain %d machines).\n", num_drained,num_to_drain); dprintf(D_FULLDEBUG,"Done evaluating defragmentation policy.\n"); }