void OfflineCollectorPlugin::mergeClassAd ( ClassAd &ad, char const *key )
{
	// Merge the attributes of 'ad' into the ad persistently stored under
	// 'key', writing only attributes whose values actually differ, inside
	// a single transaction.  Does nothing when no ad is stored for 'key'.
	if ( !_ads ) {
		return;
	}

	_ads->BeginTransaction ();

	ClassAd *stored_ad = NULL;
	if ( !_ads->LookupClassAd ( key, stored_ad ) ) {
		// No ad stored under this key; nothing to merge into.
		_ads->AbortTransaction ();
		return;
	}

	const char *attr_name = NULL;
	ExprTree *attr_expr = NULL;
	ad.ResetExpr();
	while ( ad.NextExpr( attr_name, attr_expr ) ) {
		ASSERT( attr_name && attr_expr );

		MyString new_val( ExprTreeToString( attr_expr ) );

		// Skip the write when the stored ad already holds this value.
		ExprTree *stored_expr = stored_ad->LookupExpr( attr_name );
		if ( stored_expr ) {
			MyString old_val( ExprTreeToString( stored_expr ) );
			if ( new_val == old_val ) {
				continue;
			}
		}

		// filter out stuff we never want to mess with
		if ( !strcasecmp(attr_name,ATTR_MY_TYPE) ||
			 !strcasecmp(attr_name,ATTR_TARGET_TYPE) ||
			 !strcasecmp(attr_name,ATTR_AUTHENTICATED_IDENTITY) ) {
			continue;
		}

		_ads->SetAttribute(key, attr_name, new_val.Value());
	}

	_ads->CommitTransaction ();
}
// Build the "custom attributes" section of a notification email from the
// attribute names listed in the job's EmailAttributes attribute.
//
// @param attributes  out: cleared, then (if any listed attribute is defined)
//                    filled with a blank separator line followed by one
//                    "name = value" line per defined attribute; left empty
//                    when ATTR_EMAIL_ATTRIBUTES is unset.
// @param job_ad      the job ClassAd to pull attribute values from.
static void
construct_custom_attributes( MyString &attributes, ClassAd* job_ad )
{
	attributes = "";

	bool first_time = true;

	char *tmp = NULL;
	job_ad->LookupString( ATTR_EMAIL_ATTRIBUTES, &tmp );
	if( ! tmp ) {
		// No custom attributes requested for this job.
		return;
	}

	StringList email_attrs;
	email_attrs.initializeFromString( tmp );
	free( tmp );
	tmp = NULL;

	ExprTree* expr_tree;
	email_attrs.rewind();
	while( (tmp = email_attrs.next()) ) {
		expr_tree = job_ad->LookupExpr(tmp);
		if( ! expr_tree ) {
			// BUGFIX: this dprintf was missing its trailing newline,
			// running the message into the next log line.
			dprintf(D_ALWAYS, "Custom email attribute (%s) is undefined.\n",
					tmp);
			continue;
		}
		if( first_time ) {
			// Blank line separating the custom section from the body.
			attributes.formatstr_cat( "\n\n" );
			first_time = false;
		}
		attributes.formatstr_cat( "%s = %s\n", tmp,
								  ExprTreeToString(expr_tree) );
	}
	return;
}
// Fetch job ads matching this CondorQ's query from the Quill database
// (only when built with HAVE_EXT_POSTGRESQL) and append them to 'list'.
//
// @param list        out: receives one chain-collapsed ClassAd per job.
// @param lastUpdate  in/out: passed through to the snapshot iterator.
// @param dbconn      database connection string for JobQueueSnapshot.
// @return Q_OK on success (including the no-support and empty-queue cases),
//         Q_COMMUNICATION_ERROR on snapshot failure, or the makeQuery error.
int
CondorQ::fetchQueueFromDB (ClassAdList &list,
						   char *&lastUpdate,
						   const char *dbconn,
						   CondorError*  /*errstack*/)
{
#ifndef HAVE_EXT_POSTGRESQL
	// Built without Quill/PostgreSQL support: silence unused-parameter
	// warnings and report success with an untouched list.
	(void) list;
	(void) lastUpdate;
	(void) dbconn;
#else
	int     result;
	JobQueueSnapshot	*jqSnapshot;
	const char           *constraint;
	ClassAd              *ad;
	QuillErrCode             rv;
	ExprTree *tree;

	jqSnapshot = new JobQueueSnapshot(dbconn);

	rv = jqSnapshot->startIterateAllClassAds(clusterarray,
						numclusters,
						procarray,
						numprocs,
						schedd,
						FALSE,
						scheddBirthdate,
						lastUpdate);

	if (rv == QUILL_FAILURE) {
		delete jqSnapshot;
		return Q_COMMUNICATION_ERROR;
	} else if (rv == JOB_QUEUE_EMPTY) {
		delete jqSnapshot;
		return Q_OK;
	}

	// make the query ad
	if ((result = query.makeQuery (tree)) != Q_OK) {
		delete jqSnapshot;
		return result;
	}

	// NOTE: ExprTreeToString() returns a pointer into a static buffer, so
	// 'constraint' stays valid after 'tree' is deleted — but only until the
	// next ExprTreeToString() call.
	constraint = ExprTreeToString(tree);
	delete tree;

	// Pull matching ads one at a time, collapsing each chained ad into a
	// self-contained ad before handing ownership to the caller's list.
	ad = getDBNextJobByConstraint(constraint, jqSnapshot);

	while (ad != (ClassAd *) 0) {
		ad->ChainCollapse();
		list.Insert(ad);
		ad = getDBNextJobByConstraint(constraint, jqSnapshot);
	}

	delete jqSnapshot;
#endif /* HAVE_EXT_POSTGRESQL */

	return Q_OK;
}
// Write the entire in-memory ClassAd table to 'fp' as a complete log:
// the historical sequence-number record first, then, for each ad, one
// LogNewClassAd record followed by one LogSetAttribute record per
// attribute.  EXCEPTs (fatal) on any write/flush/fsync failure, since a
// torn state file would be unrecoverable on restart.
void
ClassAdLog::LogState(FILE *fp)
{
	LogRecord	*log=NULL;
	ClassAd		*ad=NULL;
	ExprTree	*expr=NULL;
	HashKey		hashval;
	MyString	key;
	const char	*attr_name = NULL;

	// This must always be the first entry in the log.
	log = new LogHistoricalSequenceNumber( historical_sequence_number,
										   m_original_log_birthdate );
	if (log->Write(fp) < 0) {
		EXCEPT("write to %s failed, errno = %d", logFilename(), errno);
	}
	delete log;

	table.startIterations();
	while(table.iterate(ad) == 1) {
		table.getCurrentKey(hashval);
		hashval.sprint(key);
		log = new LogNewClassAd(key.Value(), ad->GetMyTypeName(),
								ad->GetTargetTypeName());
		if (log->Write(fp) < 0) {
			EXCEPT("write to %s failed, errno = %d", logFilename(), errno);
		}
		delete log;

		// Unchain the ad -- we just want to write out this ads exprs,
		// not all the exprs in the chained ad as well.
		AttrList *chain = dynamic_cast<AttrList*>(ad->GetChainedParentAd());
		ad->Unchain();
		ad->ResetName();
		attr_name = ad->NextNameOriginal();
		while (attr_name) {
			expr = ad->LookupExpr(attr_name);
			// This conditional used to check whether the ExprTree is
			// invisible, but no codepath sets any attributes
			// invisible for this call.
			if (expr) {
				log = new LogSetAttribute(key.Value(), attr_name,
										  ExprTreeToString(expr));
				if (log->Write(fp) < 0) {
					EXCEPT("write to %s failed, errno = %d",
						   logFilename(), errno);
				}
				delete log;
			}
			attr_name = ad->NextNameOriginal();
		}
		// ok, now that we're done writing out this ad, restore the chain
		ad->ChainToAd(chain);
	}
	// Push the data all the way to stable storage.
	if (fflush(fp) !=0){
		EXCEPT("fflush of %s failed, errno = %d", logFilename(), errno);
	}
	if (condor_fsync(fileno(fp)) < 0) {
		EXCEPT("fsync of %s failed, errno = %d", logFilename(), errno);
	}
}
void EmitExpression(unsigned int mode, const char *attr, ExprTree* attr_expr) { if (attr_expr == NULL) { dprintf(mode, "%s = UNDEFINED\n", attr); } else { dprintf(mode, "%s = %s\n", attr, ExprTreeToString(attr_expr)); } }
int CondorQ::fetchQueueFromHostAndProcess ( const char *host, StringList &attrs, int fetch_opts, int match_limit, condor_q_process_func process_func, void * process_func_data, int useFastPath, CondorError* errstack, ClassAd ** psummary_ad) { Qmgr_connection *qmgr; ExprTree *tree; char *constraint; int result; // make the query ad if ((result = query.makeQuery (tree)) != Q_OK) return result; constraint = strdup( ExprTreeToString( tree ) ); delete tree; if (useFastPath > 1) { int result = fetchQueueFromHostAndProcessV2(host, constraint, attrs, fetch_opts, match_limit, process_func, process_func_data, connect_timeout, useFastPath, errstack, psummary_ad); free( constraint); return result; } if (fetch_opts != fetch_Jobs) { free( constraint ); return Q_UNSUPPORTED_OPTION_ERROR; } /* connect to the Q manager. use a timeout of 20 seconds, and a read-only connection. why 20 seconds? because careful research by Derek has shown that whenever one needs a periodic time value, 20 is always optimal. :^). */ init(); // needed to get default connect_timeout if( !(qmgr = ConnectQ( host, connect_timeout, true, errstack)) ) { free( constraint ); return Q_SCHEDD_COMMUNICATION_ERROR; } // get the ads and filter them result = getFilterAndProcessAds (constraint, attrs, match_limit, process_func, process_func_data, useFastPath); DisconnectQ (qmgr); free( constraint ); return result; }
bool VMType::createConfigUsingScript(const char* configfile) { vmprintf(D_FULLDEBUG, "Inside VMType::createConfigUsingScript\n"); if( !configfile || m_scriptname.IsEmpty() ) { return false; } // Set temporary environments for script program StringList name_list; const char *name; ExprTree* expr = NULL; m_classAd.ResetExpr(); while( m_classAd.NextExpr(name, expr) ) { if( !strncasecmp( name, "JobVM", strlen("JobVM") ) || !strncasecmp( name, "VMPARAM", strlen("VMPARAM") )) { name_list.append(name); SetEnv(name, ExprTreeToString(expr)); } } ArgList systemcmd; if( m_prog_for_script.IsEmpty() == false ) { systemcmd.AppendArg(m_prog_for_script); } systemcmd.AppendArg(m_scriptname); systemcmd.AppendArg("createconfig"); systemcmd.AppendArg(configfile); int result = systemCommand(systemcmd, m_file_owner); // UnSet temporary environments for script program const char *tmp_name = NULL; name_list.rewind(); while( (tmp_name = name_list.next()) != NULL ) { UnsetEnv(tmp_name); } if( result != 0 ) { vmprintf(D_ALWAYS, "Failed to create Configuration file('%s') using " "script program('%s')\n", configfile, m_scriptname.Value()); return false; } return true; }
bool ClassAdCollection::NewClassAd(const char* key, ClassAd* ad) { LogRecord* log=new LogNewClassAd(key,GetMyTypeName(*ad),GetTargetTypeName(*ad)); ClassAdLog::AppendLog(log); const char *name; ExprTree* expr; ad->ResetExpr(); while (ad->NextExpr(name, expr)) { LogRecord* l=new LogSetAttribute(key,name,ExprTreeToString(expr)); ClassAdLog::AppendLog(l); } // return AddClassAd(0,key); return true; }
// Route 'Ad' into the correct child of a partition collection.
//
// For PartitionParent collections: compute the ad's tuple of values for the
// partition attributes (missing attributes contribute ""), find the child
// partition whose value set matches, create a new child if none does, and
// add the ad to that child.  Returns false for partition parents (the ad is
// not placed in the parent itself).  For all other collection types, defers
// to the collection's own CheckClassAd().
bool ClassAdCollection::CheckClassAd(BaseCollection* Coll,const MyString& OID,
									 ClassAd* Ad)
{
	if (Coll->Type()==PartitionParent_e) {
		PartitionParent* ParentColl=(PartitionParent*) Coll;

		// Build the set of this ad's values for the partition attributes.
		StringSet Values;
		MyString AttrName;
		MyString AttrValue;
		ParentColl->Attributes.StartIterations();
		// printf("Checking OID %s\n",OID.Value());
		while (ParentColl->Attributes.Iterate(AttrName)) {
			ExprTree* expr=Ad->LookupExpr(AttrName.Value());
			if (expr) {
				AttrValue = ExprTreeToString( expr );
			} else {
				// Missing attribute maps to the empty string.
				AttrValue = "";
			}
			Values.Add(AttrValue);
		}
		// NOTE(review): StartIterations() is commented out before the debug
		// dump loop below — this looks like leftover debug code; confirm
		// whether the dump loops were meant to be disabled entirely.
		// Values.StartIterations();
		while (Values.Iterate(AttrValue)) {
			printf("Val: AttrValue=%s\n",AttrValue.Value());
		}

		// Find an existing child partition whose value set equals 'Values'.
		int CoID;
		PartitionChild* ChildColl=NULL;
		ParentColl->Children.StartIterations();
		while (ParentColl->Children.Iterate(CoID)) {
			if (Collections.lookup(CoID,Coll)==-1) continue;
			ChildColl=(PartitionChild*) Coll;
			// NOTE(review): same leftover-debug pattern as above.
			// ChildColl->Values.StartIterations();
			while (ChildColl->Values.Iterate(AttrValue)) {
				printf("ChildVal: AttrValue=%s\n",AttrValue.Value());
			}
			if (EqualSets(ChildColl->Values,Values)) break;
			ChildColl=NULL;
		}

		if (ChildColl==NULL) {
			// Create a new child collection
			ChildColl=new PartitionChild(ParentColl->Rank,Values);
			CoID=LastCoID+1;
			if (Collections.insert(CoID,ChildColl)==-1) return false;
			LastCoID=CoID;

			// Add to parent's children
			ParentColl->Children.Add(CoID);
		}

		// Add to child
		AddClassAd(CoID,OID,Ad);
		return false;
	}
	else {
		return Coll->CheckClassAd(Ad);
	}
}
int CondorQ:: fetchQueue (ClassAdList &list, StringList &attrs, ClassAd *ad, CondorError* errstack) { Qmgr_connection *qmgr; ExprTree *tree; int result; char scheddString [32]; const char *constraint; bool useFastPath = false; // make the query ad if ((result = query.makeQuery (tree)) != Q_OK) return result; constraint = ExprTreeToString( tree ); delete tree; // connect to the Q manager init(); // needed to get default connect_timeout if (ad == 0) { // local case if( !(qmgr = ConnectQ( 0, connect_timeout, true, errstack)) ) { errstack->push("TEST", 0, "FOO"); return Q_SCHEDD_COMMUNICATION_ERROR; } useFastPath = true; } else { // remote case to handle condor_globalq if (!ad->LookupString (ATTR_SCHEDD_IP_ADDR, scheddString, sizeof(scheddString))) return Q_NO_SCHEDD_IP_ADDR; if( !(qmgr = ConnectQ( scheddString, connect_timeout, true, errstack)) ) return Q_SCHEDD_COMMUNICATION_ERROR; } // get the ads and filter them getAndFilterAds (constraint, attrs, list, useFastPath); DisconnectQ (qmgr); return Q_OK; }
int CondorQ:: fetchQueueFromHost (ClassAdList &list, StringList &attrs, const char *host, char const *schedd_version, CondorError* errstack) { Qmgr_connection *qmgr; ExprTree *tree; const char *constraint; int result; // make the query ad if ((result = query.makeQuery (tree)) != Q_OK) return result; constraint = ExprTreeToString( tree ); delete tree; /* connect to the Q manager. use a timeout of 20 seconds, and a read-only connection. why 20 seconds? because careful research by Derek has shown that whenever one needs a periodic time value, 20 is always optimal. :^). */ init(); // needed to get default connect_timeout if( !(qmgr = ConnectQ( host, connect_timeout, true, errstack)) ) return Q_SCHEDD_COMMUNICATION_ERROR; int useFastPath = 0; if( schedd_version && *schedd_version ) { CondorVersionInfo v(schedd_version); useFastPath = v.built_since_version(6,9,3) ? 1 : 0; if (v.built_since_version(8, 1, 5)) { useFastPath = 2; } } // get the ads and filter them result = getAndFilterAds (constraint, attrs, -1, list, useFastPath); DisconnectQ (qmgr); return result; }
int JobTransforms::set_dirty_attributes(ClassAd *ad, int cluster, int proc)
{
	// Push every dirty attribute of 'ad' into the transaction log via
	// SetAttribute().  Returns the number of attributes written, -1 on an
	// unprintable attribute, or -2 when SetAttribute fails.
	int num_attrs_set = 0;

	for ( classad::ClassAd::dirtyIterator it = ad->dirtyBegin();
		  it != ad->dirtyEnd(); ++it )
	{
		const char *rhstr = NULL;
		ExprTree *tree = ad->Lookup( *it );
		if ( tree ) {
			rhstr = ExprTreeToString( tree );
		} else {
			// If an attribute is marked as dirty but Lookup() on this
			// attribute fails, it means that attribute was deleted.
			// We handle this by inserting into the transaction log a
			// SetAttribute to UNDEFINED, which will work properly even if we
			// are dealing with a chained job ad that has the attribute set
			// differently in the cluster ad.
			rhstr = "UNDEFINED";
		}
		if( !rhstr) {
			dprintf(D_ALWAYS,
				"(%d.%d) job_transforms: Problem processing classad\n",
				cluster, proc);
			return -1;
		}
		dprintf(D_FULLDEBUG, "(%d.%d) job_transforms: Setting %s = %s\n",
			cluster, proc, it->c_str(), rhstr);
		if( SetAttribute(cluster, proc, it->c_str(), rhstr) == -1 ) {
			dprintf(D_ALWAYS,
				"(%d.%d) job_transforms: Failed to set %s = %s\n",
				cluster, proc, it->c_str(), rhstr);
			return -2;
		}
		num_attrs_set++;
	}

	return num_attrs_set;
}
int pseudo_get_job_attr( const char *name, MyString &expr ) { RemoteResource *remote; if (parallelMasterResource == NULL) { remote = thisRemoteResource; } else { remote = parallelMasterResource; } ClassAd *ad = remote->getJobAd(); ExprTree *e = ad->LookupExpr(name); if(e) { expr = ExprTreeToString(e); dprintf(D_SYSCALLS,"pseudo_get_job_attr(%s) = %s\n",name,expr.Value()); return 0; } else { dprintf(D_SYSCALLS,"pseudo_get_job_attr(%s) is UNDEFINED\n",name); expr = "UNDEFINED"; return 0; } }
int CondorQ::fetchQueueFromHostAndProcess ( const char *host, StringList &attrs, process_function process_func, bool useFastPath, CondorError* errstack) { Qmgr_connection *qmgr; ExprTree *tree; char *constraint; int result; // make the query ad if ((result = query.makeQuery (tree)) != Q_OK) return result; constraint = strdup( ExprTreeToString( tree ) ); delete tree; /* connect to the Q manager. use a timeout of 20 seconds, and a read-only connection. why 20 seconds? because careful research by Derek has shown that whenever one needs a periodic time value, 20 is always optimal. :^). */ init(); // needed to get default connect_timeout if( !(qmgr = ConnectQ( host, connect_timeout, true, errstack)) ) { free( constraint ); return Q_SCHEDD_COMMUNICATION_ERROR; } // get the ads and filter them result = getFilterAndProcessAds (constraint, attrs, process_func, useFastPath); DisconnectQ (qmgr); free( constraint ); return result; }
// Convenience overload that unparses 'expr' into a function-local static
// buffer.
//
// WARNING: the returned pointer refers to that static buffer, so it is only
// valid until the next call to this overload — and the static makes this
// function NOT thread-safe.  Callers that need the text to persist must
// copy it (see the strdup() at the call sites).
const char *
ExprTreeToString( const classad::ExprTree *expr )
{
	static std::string buffer;

	buffer = "";
	return ExprTreeToString(expr, buffer);
}
// TODO: this code doesn't work as expected // everything seems to get set to EXPR_TYPE bool LiveJobImpl::Get ( const char *_name, const Attribute *&_attribute ) const { // our job ad is chained so lookups will // encompass our parent ad as well as the child // parse the type ExprTree *expr = NULL; if ( ! ( expr = m_full_ad->Lookup ( _name ) ) ) { dprintf ( D_FULLDEBUG, "warning: failed to lookup attribute %s in job '%s'\n", _name, m_job->GetKey() ); return false; } // decode the type classad::Value value; m_full_ad->EvaluateExpr(expr,value); switch ( value.GetType() ) { case classad::Value::INTEGER_VALUE: { int i; if ( !m_full_ad->LookupInteger ( _name, i ) ) { return false; } const char* int_str = to_string<int> ( i,dec ).c_str(); _attribute = new Attribute ( Attribute::INTEGER_TYPE, int_str ); return true; } case classad::Value::REAL_VALUE: { float f; if ( !m_full_ad->LookupFloat ( _name, f ) ) { return false; } const char* float_str = to_string<float> ( f,dec ).c_str(); _attribute = new Attribute ( Attribute::FLOAT_TYPE, float_str ); return true; } case classad::Value::STRING_VALUE: { std::string str; if ( !m_full_ad->LookupString ( _name, str ) ) { return false; } _attribute = new Attribute ( Attribute::STRING_TYPE, str.c_str() ); return true; } default: { ExprTree* tree = NULL; if ( ! ( tree = m_full_ad->Lookup ( _name ) ) ) { return false; } const char* rhs; rhs = ExprTreeToString( expr ); _attribute = new Attribute ( Attribute::EXPR_TYPE, rhs ); return true; } } return false; }
// One full synchronization pass between the gridmanager and its schedd.
// In order: service pending vacate requests, connect to the schedd, check
// job lease renewals, query for new jobs to manage, query for removed/held
// jobs, retrieve dirty attribute updates, service pending job-status
// requests, write pending attribute updates, and delete jobs — each qmgmt
// phase inside its own remote transaction.  Any qmgmt timeout jumps to
// contact_schedd_disconnect, which rolls back the open transaction and
// (after too many consecutive failures) EXCEPTs; otherwise a retry is
// scheduled via RequestContactSchedd().
void doContactSchedd()
{
	int rc;
	Qmgr_connection *schedd;
	BaseJob *curr_job;
	ClassAd *next_ad;
	char expr_buf[12000];
	bool schedd_updates_complete = false;
	bool schedd_deletes_complete = false;
	bool add_remove_jobs_complete = false;
	bool update_jobs_complete = false;
	bool commit_transaction = true;
	int failure_line_num = 0;
	bool send_reschedule = false;
	std::string error_str = "";
	StringList dirty_job_ids;
	char *job_id_str;
	PROC_ID job_id;
	CondorError errstack;

	dprintf(D_FULLDEBUG,"in doContactSchedd()\n");

	initJobExprs();

	contactScheddTid = TIMER_UNSET;

	// vacateJobs
	/////////////////////////////////////////////////////
	if ( pendingScheddVacates.getNumElements() != 0 ) {
		std::string buff;
		StringList job_ids;
		VacateRequest curr_request;
		int result;
		ClassAd* rval;

		// Collect "cluster.proc" ids for every pending vacate request.
		pendingScheddVacates.startIterations();
		while ( pendingScheddVacates.iterate( curr_request ) != 0 ) {
			formatstr( buff, "%d.%d", curr_request.job->procID.cluster,
					   curr_request.job->procID.proc );
			job_ids.append( buff.c_str() );
		}

		char *tmp = job_ids.print_to_string();
		if ( tmp ) {
			dprintf( D_FULLDEBUG, "Calling vacateJobs on %s\n", tmp );
			free(tmp);
			tmp = NULL;
		}

		rval = ScheddObj->vacateJobs( &job_ids, VACATE_FAST, &errstack );
		if ( rval == NULL ) {
			formatstr( error_str,
					   "vacateJobs returned NULL, CondorError: %s!",
					   errstack.getFullText().c_str() );
			goto contact_schedd_failure;
		} else {
			// The result ad holds one "job_<cluster>_<proc>" integer per job.
			pendingScheddVacates.startIterations();
			while ( pendingScheddVacates.iterate( curr_request ) != 0 ) {
				formatstr( buff, "job_%d_%d",
						   curr_request.job->procID.cluster,
						   curr_request.job->procID.proc );
				if ( !rval->LookupInteger( buff.c_str(), result ) ) {
					dprintf( D_FULLDEBUG,
							 "vacateJobs returned malformed ad\n" );
					EXCEPT( "vacateJobs returned malformed ad" );
				} else {
					dprintf( D_FULLDEBUG, " %d.%d vacate result: %d\n",
							 curr_request.job->procID.cluster,
							 curr_request.job->procID.proc,result);
					pendingScheddVacates.remove( curr_request.job->procID );
					curr_request.result = (action_result_t)result;
					curr_request.job->SetEvaluateState();
					completedScheddVacates.insert( curr_request.job->procID,
												   curr_request );
				}
			}
			delete rval;
		}
	}

	schedd = ConnectQ( ScheddAddr, QMGMT_TIMEOUT, false, NULL, myUserName,
					   CondorVersion() );
	if ( !schedd ) {
		error_str = "Failed to connect to schedd!";
		goto contact_schedd_failure;
	}

	// CheckLeases
	/////////////////////////////////////////////////////
	if ( checkLeasesSignaled ) {

		dprintf( D_FULLDEBUG, "querying for renewed leases\n" );

		// Grab the lease attributes of all the jobs in our global hashtable.
		BaseJob::JobsByProcId.startIterations();
		while ( BaseJob::JobsByProcId.iterate( curr_job ) != 0 ) {
			int new_expiration;

			rc = GetAttributeInt( curr_job->procID.cluster,
								  curr_job->procID.proc,
								  ATTR_TIMER_REMOVE_CHECK,
								  &new_expiration );
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
					// This job doesn't have doesn't have a lease from
					// the submitter. Skip it.
					continue;
				}
			}
			curr_job->UpdateJobLeaseReceived( new_expiration );
		}

		checkLeasesSignaled = false;
	}	// end of handling check leases

	// AddJobs
	/////////////////////////////////////////////////////
	if ( addJobsSignaled || firstScheddContact ) {
		int num_ads = 0;

		dprintf( D_FULLDEBUG, "querying for new jobs\n" );

		// Make sure we grab all Globus Universe jobs (except held ones
		// that we previously indicated we were done with)
		// when we first start up in case we're recovering from a
		// shutdown/meltdown.
		// Otherwise, grab all jobs that are unheld and aren't marked as
		// currently being managed and aren't marked as not matched.
		// If JobManaged is undefined, equate it with false.
		// If Matched is undefined, equate it with true.
		// NOTE: Schedds from Condor 6.6 and earlier don't include
		// "(Universe==9)" in the constraint they give to the gridmanager,
		// so this gridmanager will pull down non-globus-universe ads,
		// although it won't use them. This is inefficient but not
		// incorrect behavior.
		if ( firstScheddContact ) {
			// Grab all jobs for us to manage. This expression is a
			// derivative of the expression below for new jobs. We add
			// "|| Managed =?= TRUE" to also get jobs our previous
			// incarnation was in the middle of managing when it died
			// (if it died unexpectedly). With the new term, the
			// "&& Managed =!= TRUE" from the new jobs expression becomes
			// superfluous (by boolean logic), so we drop it.
			sprintf( expr_buf,
					 "%s && %s && ((%s && %s) || %s)",
					 expr_schedd_job_constraint.c_str(),
					 expr_not_completely_done.c_str(),
					 expr_matched_or_undef.c_str(),
					 expr_not_held.c_str(),
					 expr_managed.c_str() );
		} else {
			// Grab new jobs for us to manage
			sprintf( expr_buf,
					 "%s && %s && %s && %s && %s",
					 expr_schedd_job_constraint.c_str(),
					 expr_not_completely_done.c_str(),
					 expr_matched_or_undef.c_str(),
					 expr_not_held.c_str(),
					 expr_not_managed.c_str() );
		}
		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			PROC_ID procID;
			BaseJob *old_job;
			int job_is_matched = 1;		// default to true if not in ClassAd

			next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, procID.proc );
			bool job_is_managed = jobExternallyManaged(next_ad);
			next_ad->LookupBool(ATTR_JOB_MATCHED,job_is_matched);

			if ( BaseJob::JobsByProcId.lookup( procID, old_job ) != 0 ) {

				JobType *job_type = NULL;
				BaseJob *new_job = NULL;

				// job had better be either managed or matched! (or both)
				ASSERT( job_is_managed || job_is_matched );

				if ( MustExpandJobAd( next_ad ) ) {
					// Get the expanded ClassAd from the schedd, which
					// has the GridResource filled in with info from
					// the matched ad.
					delete next_ad;
					next_ad = NULL;
					next_ad = GetJobAd(procID.cluster,procID.proc);
					if ( next_ad == NULL && errno == ETIMEDOUT ) {
						failure_line_num = __LINE__;
						commit_transaction = false;
						goto contact_schedd_disconnect;
					}
					if ( next_ad == NULL ) {
						// We may get here if it was not possible to expand
						// one of the $$() expressions. We don't want to
						// roll back the transaction and blow away the
						// hold that the schedd just put on the job, so
						// simply skip over this ad.
						dprintf(D_ALWAYS,"Failed to get expanded job ClassAd from Schedd for %d.%d. errno=%d\n",procID.cluster,procID.proc,errno);
						goto contact_schedd_next_add_job;
					}
				}

				// Search our job types for one that'll handle this job
				jobTypes.Rewind();
				while ( jobTypes.Next( job_type ) ) {
					if ( job_type->AdMatchFunc( next_ad ) ) {

						// Found one!
						dprintf( D_FULLDEBUG,
								 "Using job type %s for job %d.%d\n",
								 job_type->Name, procID.cluster,
								 procID.proc );
						break;
					}
				}

				if ( job_type != NULL ) {
					new_job = job_type->CreateFunc( next_ad );
				} else {
					dprintf( D_ALWAYS, "No handlers for job %d.%d\n",
							 procID.cluster, procID.proc );
					new_job = new BaseJob( next_ad );
				}

				ASSERT(new_job);
				new_job->SetEvaluateState();
				dprintf(D_ALWAYS,"Found job %d.%d --- inserting\n",
						new_job->procID.cluster,new_job->procID.proc);
				num_ads++;

				if ( !job_is_managed ) {
					rc = tSetAttributeString( new_job->procID.cluster,
									   new_job->procID.proc,
									   ATTR_JOB_MANAGED,
									   MANAGED_EXTERNAL);
					if ( rc < 0 ) {
						failure_line_num = __LINE__;
						commit_transaction = false;
						goto contact_schedd_disconnect;
					}
				}

			} else {

				// We already know about this job, skip
				// But also set Managed=true on the schedd so that it won't
				// keep signalling us about it
				delete next_ad;
				rc = tSetAttributeString( procID.cluster, procID.proc,
								   ATTR_JOB_MANAGED, MANAGED_EXTERNAL );
				if ( rc < 0 ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				}

			}

contact_schedd_next_add_job:
			next_ad = GetNextJobByConstraint( expr_buf, 0 );
		}	// end of while next_ad
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			commit_transaction = false;
			goto contact_schedd_disconnect;
		}

		dprintf(D_FULLDEBUG,"Fetched %d new job ads from schedd\n",num_ads);
	}	// end of handling add jobs

	// RemoveJobs
	/////////////////////////////////////////////////////

	// We always want to perform this check. Otherwise, we may overwrite a
	// REMOVED/HELD/COMPLETED status with something else below.
	{
		int num_ads = 0;

		dprintf( D_FULLDEBUG, "querying for removed/held jobs\n" );

		// Grab jobs marked as REMOVED/COMPLETED or marked as HELD that we
		// haven't previously indicated that we're done with (by setting
		// JobManaged to "Schedd".
		sprintf( expr_buf, "(%s) && (%s) && (%s == %d || %s == %d || (%s == %d && %s =?= \"%s\"))",
				 ScheddJobConstraint, expr_not_completely_done.c_str(),
				 ATTR_JOB_STATUS, REMOVED,
				 ATTR_JOB_STATUS, COMPLETED, ATTR_JOB_STATUS, HELD,
				 ATTR_JOB_MANAGED, MANAGED_EXTERNAL );

		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			PROC_ID procID;
			BaseJob *next_job;
			int curr_status;

			next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, procID.proc );
			next_ad->LookupInteger( ATTR_JOB_STATUS, curr_status );

			if ( BaseJob::JobsByProcId.lookup( procID, next_job ) == 0 ) {
				// Should probably skip jobs we already have marked as
				// held or removed
				next_job->JobAdUpdateFromSchedd( next_ad, true );
				num_ads++;
			} else if ( curr_status == REMOVED ) {
				// If we don't know about the job, act like we got an
				// ADD_JOBS signal from the schedd the next time we
				// connect, so that we'll create a Job object for it
				// and decide how it needs to be handled.
				// TODO The AddJobs and RemoveJobs queries shoule be
				//   combined into a single query.
				dprintf( D_ALWAYS,
						 "Don't know about removed job %d.%d. "
						 "Will treat it as a new job to manage\n",
						 procID.cluster, procID.proc );
				addJobsSignaled = true;
			} else {
				dprintf( D_ALWAYS,
						 "Don't know about held/completed job %d.%d. "
						 "Ignoring it\n",
						 procID.cluster, procID.proc );
			}

			delete next_ad;
			next_ad = GetNextJobByConstraint( expr_buf, 0 );
		}
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			commit_transaction = false;
			goto contact_schedd_disconnect;
		}

		dprintf(D_FULLDEBUG,"Fetched %d job ads from schedd\n",num_ads);
	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	add_remove_jobs_complete = true;

	// Retrieve dirty attributes
	/////////////////////////////////////////////////////
	if ( updateJobsSignaled ) {
		dprintf( D_FULLDEBUG, "querying for jobs with attribute updates\n" );

		sprintf( expr_buf, "%s && %s && %s && %s",
				 expr_schedd_job_constraint.c_str(),
				 expr_not_completely_done.c_str(),
				 expr_not_held.c_str(),
				 expr_managed.c_str() );

		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextDirtyJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			ClassAd updates;
			char str[PROC_ID_STR_BUFLEN];

			next_ad->LookupInteger( ATTR_CLUSTER_ID, job_id.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, job_id.proc );
			if ( GetDirtyAttributes( job_id.cluster, job_id.proc,
									 &updates ) < 0 ) {
				dprintf( D_ALWAYS, "Failed to retrieve dirty attributes for job %d.%d\n", job_id.cluster, job_id.proc );
				failure_line_num = __LINE__;
				delete next_ad;
				goto contact_schedd_disconnect;
			} else {
				dprintf (D_FULLDEBUG, "Retrieved updated attributes for job %d.%d\n", job_id.cluster, job_id.proc);
				dPrintAd(D_JOB, updates);
			}
			if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) {
				curr_job->JobAdUpdateFromSchedd( &updates, false );
				ProcIdToStr( job_id, str );
				dirty_job_ids.append( str );
			} else {
				dprintf( D_ALWAYS,
						 "Don't know about updated job %d.%d. "
						 "Ignoring it\n",
						 job_id.cluster, job_id.proc );
			}

			delete next_ad;
			next_ad = GetNextDirtyJobByConstraint( expr_buf, 0 );
		}
	}
	update_jobs_complete = true;

	// if ( BeginTransaction() < 0 ) {
	// NOTE(review): BeginTransaction() apparently reports timeouts only via
	// errno here, hence the errno reset/check pattern.
	errno = 0;
	BeginTransaction();
	if ( errno == ETIMEDOUT ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	// requestJobStatus
	/////////////////////////////////////////////////////
	if ( pendingJobStatus.getNumElements() != 0 ) {
		JobStatusRequest curr_request;

		pendingJobStatus.startIterations();
		while ( pendingJobStatus.iterate( curr_request ) != 0 ) {
			int status;

			rc = GetAttributeInt( curr_request.job_id.cluster,
								  curr_request.job_id.proc,
								  ATTR_JOB_STATUS, &status );
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
					// The job is not in the schedd's job queue. This
					// probably means that the user did a condor_rm -f,
					// so return a job status of REMOVED.
					status = REMOVED;
				}
			}
			// return status
			dprintf( D_FULLDEBUG, "%d.%d job status: %d\n",
					 curr_request.job_id.cluster,
					 curr_request.job_id.proc, status );
			pendingJobStatus.remove( curr_request.job_id );
			curr_request.job_status = status;
			daemonCore->Reset_Timer( curr_request.tid, 0 );
			completedJobStatus.insert( curr_request.job_id,
									   curr_request );
		}

	}

	// Update existing jobs
	/////////////////////////////////////////////////////
	ScheddUpdateRequest *curr_request;
	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		dprintf(D_FULLDEBUG,"Updating classad values for %d.%d:\n",
				curr_job->procID.cluster, curr_job->procID.proc);
		const char *attr_name;
		const char *attr_value;
		ExprTree *expr;
		bool fake_job_in_queue = false;
		curr_job->jobAd->ResetExpr();
		while ( curr_job->jobAd->NextDirtyExpr(attr_name, expr) == true &&
				fake_job_in_queue == false ) {
			attr_value = ExprTreeToString( expr );

			dprintf(D_FULLDEBUG," %s = %s\n",attr_name,attr_value);
			rc = SetAttribute( curr_job->procID.cluster,
							   curr_job->procID.proc,
							   attr_name,
							   attr_value);
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
					// The job is not in the schedd's job queue. This
					// probably means that the user did a condor_rm -f,
					// so pretend that all updates for the job succeed.
					// Otherwise, we'll never make forward progress on
					// the job.
					// TODO We should also fake a job status of REMOVED
					//   to the job, so it can do what cleanup it can.
					fake_job_in_queue = true;
					break;
				}
			}
		}

	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	schedd_updates_complete = true;

	// Delete existing jobs
	/////////////////////////////////////////////////////
	errno = 0;
	BeginTransaction();
	if ( errno == ETIMEDOUT ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		if ( curr_job->deleteFromSchedd ) {
			dprintf(D_FULLDEBUG,"Deleting job %d.%d from schedd\n",
					curr_job->procID.cluster,
					curr_job->procID.proc);
			rc = DestroyProc(curr_job->procID.cluster,
							 curr_job->procID.proc);
			// NOENT means the job doesn't exist. Good enough for us.
			if ( rc < 0 && rc != DESTROYPROC_ENOENT) {
				failure_line_num = __LINE__;
				commit_transaction = false;
				goto contact_schedd_disconnect;
			}
		}

	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	schedd_deletes_complete = true;


 contact_schedd_disconnect:
	DisconnectQ( schedd, commit_transaction );

	if ( add_remove_jobs_complete == true ) {
		firstScheddContact = false;
		addJobsSignaled = false;
	} else {
		formatstr( error_str, "Schedd connection error during Add/RemoveJobs at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	if ( update_jobs_complete == true ) {
		updateJobsSignaled = false;
	} else {
		formatstr( error_str, "Schedd connection error during dirty attribute update at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	if ( schedd_updates_complete == false ) {
		formatstr( error_str, "Schedd connection error during updates at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	// Clear dirty bits for all jobs updated
	if ( !dirty_job_ids.isEmpty() ) {
		ClassAd *rval;
		dprintf( D_FULLDEBUG, "Calling clearDirtyAttrs on %d jobs\n",
				 dirty_job_ids.number() );
		dirty_job_ids.rewind();
		rval = ScheddObj->clearDirtyAttrs( &dirty_job_ids, &errstack );
		if ( rval == NULL ) {
			dprintf(D_ALWAYS, "Failed to notify schedd to clear dirty attributes. CondorError: %s\n", errstack.getFullText().c_str() );
		}
		delete rval;
	}

	// Wake up jobs that had schedd updates pending and delete job
	// objects that wanted to be deleted
	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		curr_job->jobAd->ClearAllDirtyFlags();

		if ( curr_job->deleteFromGridmanager ) {

			// If the Job object wants to delete the job from the
			// schedd but we failed to do so, don't delete the job
			// object yet; wait until we successfully delete the job
			// from the schedd.
			if ( curr_job->deleteFromSchedd == true &&
				 schedd_deletes_complete == false ) {
				continue;
			}

			// If wantRematch is set, send a reschedule now
			if ( curr_job->wantRematch ) {
				send_reschedule = true;
			}

			pendingScheddUpdates.remove( curr_job->procID );
			pendingScheddVacates.remove( curr_job->procID );
			pendingJobStatus.remove( curr_job->procID );
			completedJobStatus.remove( curr_job->procID );
			completedScheddVacates.remove( curr_job->procID );
			delete curr_job;

		} else {
			pendingScheddUpdates.remove( curr_job->procID );

			if ( curr_request->m_notify ) {
				curr_job->SetEvaluateState();
			}
		}

		delete curr_request;
	}

	// Poke objects that wanted to be notified when a schedd update completed
	// successfully (possibly minus deletes)
	int timer_id;
	scheddUpdateNotifications.Rewind();
	while ( scheddUpdateNotifications.Next( timer_id ) ) {
		daemonCore->Reset_Timer( timer_id, 0 );
	}
	scheddUpdateNotifications.Clear();

	if ( send_reschedule == true ) {
		ScheddObj->reschedule();
	}

	// Check if we have any jobs left to manage. If not, exit.
	if ( BaseJob::JobsByProcId.getNumElements() == 0 ) {
		dprintf( D_ALWAYS, "No jobs left, shutting down\n" );
		daemonCore->Send_Signal( daemonCore->getpid(), SIGTERM );
	}

	lastContactSchedd = time(NULL);

	if ( schedd_deletes_complete == false ) {
		error_str = "Problem using DestroyProc to delete jobs!";
		goto contact_schedd_failure;
	}

	scheddFailureCount = 0;

	// For each job that had dirty attributes, re-evaluate the policy
	dirty_job_ids.rewind();
	while ( (job_id_str = dirty_job_ids.next()) != NULL ) {
		StrToProcIdFixMe(job_id_str, job_id);
		if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) {
			curr_job->EvalPeriodicJobExpr();
		}
	}

	dprintf(D_FULLDEBUG,"leaving doContactSchedd()\n");
	return;

 contact_schedd_failure:
	scheddFailureCount++;
	if ( error_str == "" ) {
		error_str = "Failure in doContactSchedd";
	}
	if ( scheddFailureCount >= maxScheddFailures ) {
		dprintf( D_ALWAYS, "%s\n", error_str.c_str() );
		EXCEPT( "Too many failures connecting to schedd!" );
	}
	dprintf( D_ALWAYS, "%s Will retry\n", error_str.c_str() );
	lastContactSchedd = time(NULL);
	RequestContactSchedd();
	return;
}
/* Fill in a PROC structure given a job ClassAd. This function replaces GetProc from qmgr_lib_support.C, so we just get the job classad once from the schedd and fill in the PROC structure directly. */ int MakeProc(ClassAd *ad, PROC *p) { char buf[ATTRLIST_MAX_EXPRESSION]; float utime,stime; ExprTree *e; p->version_num = 3; ad->LookupInteger(ATTR_CLUSTER_ID, p->id.cluster); ad->LookupInteger(ATTR_PROC_ID, p->id.proc); ad->LookupInteger(ATTR_JOB_UNIVERSE, p->universe); ad->LookupInteger(ATTR_WANT_CHECKPOINT, p->checkpoint); ad->LookupInteger(ATTR_WANT_REMOTE_SYSCALLS, p->remote_syscalls); ad->LookupString(ATTR_OWNER, buf); p->owner = strdup(buf); ad->LookupInteger(ATTR_Q_DATE, p->q_date); ad->LookupInteger(ATTR_COMPLETION_DATE, p->completion_date); ad->LookupInteger(ATTR_JOB_STATUS, p->status); ad->LookupInteger(ATTR_JOB_PRIO, p->prio); ad->LookupInteger(ATTR_JOB_NOTIFICATION, p->notification); ad->LookupInteger(ATTR_IMAGE_SIZE, p->image_size); // There are two different syntaxes for the environment. Since // the wire protocol only expects one, we pack either one into the // same proc variable "env_v1or2" and make sure they are // distinguishable. For backward compatibility, the schedd // will have already ensured that we use V1 syntax if the // remote side only understands that. Env envobj; MyString env_v1or2; MyString env_error_msg; if(!envobj.getDelimitedStringV1or2Raw(ad,&env_v1or2,&env_error_msg)) { EXCEPT("Failed to parse environment string: %s\n", env_error_msg.Value()); } p->env_v1or2 = strdup(env_v1or2.Value()); p->n_cmds = 1; p->cmd = (char **) malloc(p->n_cmds * sizeof(char *)); p->args_v1or2 = (char **) malloc(p->n_cmds * sizeof(char *)); p->in = (char **) malloc(p->n_cmds * sizeof(char *)); p->out = (char **) malloc(p->n_cmds * sizeof(char *)); p->err = (char **) malloc(p->n_cmds * sizeof(char *)); p->exit_status = (int *) malloc(p->n_cmds * sizeof(int)); // There are two different syntaxes for arguments. 
Since // the wire protocol only expects one, we pack either one into the // same proc variable "env_v1or2" and make sure they are // distinguishable. For backward compatibility, the schedd // will have already ensured that we use V1 syntax if the // remote side only understands that. ArgList args; MyString args_v1or2; MyString error_msg; if(!args.GetArgsStringV1or2Raw(ad,&args_v1or2,&error_msg)) { EXCEPT("Failed to get V1or2 arguments string: %s\n",error_msg.Value()); } p->args_v1or2[0] = strdup(args_v1or2.Value()); ad->LookupString(ATTR_JOB_CMD, buf); p->cmd[0] = strdup(buf); ad->LookupString(ATTR_JOB_INPUT, buf); p->in[0] = strdup(buf); ad->LookupString(ATTR_JOB_OUTPUT, buf); p->out[0] = strdup(buf); ad->LookupString(ATTR_JOB_ERROR, buf); p->err[0] = strdup(buf); ad->LookupInteger(ATTR_JOB_EXIT_STATUS, p->exit_status[0]); ad->LookupInteger(ATTR_MIN_HOSTS, p->min_needed); ad->LookupInteger(ATTR_MAX_HOSTS, p->max_needed); ad->LookupString(ATTR_JOB_ROOT_DIR, buf); p->rootdir = strdup(buf); ad->LookupString(ATTR_JOB_IWD, buf); p->iwd = strdup(buf); e = ad->LookupExpr(ATTR_REQUIREMENTS); if (e) { p->requirements = strdup(ExprTreeToString(e)); } else { p->requirements = NULL; } e = ad->LookupExpr(ATTR_RANK); if (e) { p->preferences = strdup(ExprTreeToString(e)); } else { p->preferences = NULL; } ad->LookupFloat(ATTR_JOB_LOCAL_USER_CPU, utime); ad->LookupFloat(ATTR_JOB_LOCAL_SYS_CPU, stime); float_to_rusage(utime, stime, &(p->local_usage)); p->remote_usage = (struct rusage *) malloc(p->n_cmds * sizeof(struct rusage)); memset(p->remote_usage, 0, sizeof( struct rusage )); ad->LookupFloat(ATTR_JOB_REMOTE_USER_CPU, utime); ad->LookupFloat(ATTR_JOB_REMOTE_SYS_CPU, stime); float_to_rusage(utime, stime, &(p->remote_usage[0])); return 0; }
// Push the job ClassAd (or the VM-relevant subset of it) to the vmgahp
// server over the command pipe, bracketed by the CLASSAD / CLASSAD_END
// protocol commands.
//
// workingdir: VM working directory, sent as the VM_WORKING_DIR attribute.
// Returns true only if the vmgahp acknowledges both the opening CLASSAD
// command and the closing CLASSAD_END with a result whose first argument
// starts with 'S' (success).
bool VMGahpServer::publishVMClassAd(const char *workingdir)
{
	static const char* command = "CLASSAD";

	// Preconditions: we need a job ad, a working dir, an initialized
	// server, and a vmgahp that supports the CLASSAD command.
	if( !m_job_ad || !workingdir ) {
		return false;
	}

	if( m_is_initialized == false ) {
		return false;
	}

	if( isSupportedCommand(command) == FALSE ) {
		return false;
	}

	// Before sending command, print the remaining stderr messages from vmgahp
	printSystemErrorMsg();

	if(write_line(command) == false) {
		return false;
	}

	// give some time to gahp server
	sleep(1);

	// Read the vmgahp's response to the CLASSAD command; anything other
	// than a first argument beginning with 'S' is a failure.
	Gahp_Args start_result;
	if( read_argv(start_result) == false ) {
		return false;
	}
	if( start_result.argc == 0 || start_result.argv[0][0] != 'S' ) {
		dprintf(D_ALWAYS,"VMGAHP command '%s' failed\n",command);
		return false;
	}

	// Send Working directory
	MyString vmAttr;
	vmAttr.sprintf("VM_WORKING_DIR = \"%s\"", workingdir);

	if ( write_line( vmAttr.Value() ) == false ) {
		return false;
	}

	// publish job ClassAd to vmgahp server via pipe
	bool can_send_it = false;
	int total_len = 0;	// bytes written since the last pause, see below

	const char *name;
	ExprTree *expr = NULL;
	m_job_ad->ResetExpr();
	while( m_job_ad->NextExpr(name, expr) ) {
		can_send_it = false;

		if( !m_send_all_classad ) {
			// Instead of sending entire job ClassAd to vmgahp,
			// we will send some part of job ClassAd necessary to vmgahp.
			// NOTE(review): these are prefix matches (strncasecmp with
			// the pattern's own length), so e.g. any attribute starting
			// with "JobVM" or "VMPARAM" is included.
			if( !strncasecmp( name, "JobVM", strlen("JobVM") ) ||
				!strncasecmp( name, "VMPARAM", strlen("VMPARAM") ) ||
				!strncasecmp( name, ATTR_CLUSTER_ID, strlen(ATTR_CLUSTER_ID)) ||
				!strncasecmp( name, ATTR_PROC_ID, strlen(ATTR_PROC_ID)) ||
				!strncasecmp( name, ATTR_USER, strlen(ATTR_USER)) ||
				!strncasecmp( name, ATTR_ORIG_JOB_IWD, strlen(ATTR_ORIG_JOB_IWD)) ||
				!strncasecmp( name, ATTR_JOB_ARGUMENTS1, strlen(ATTR_JOB_ARGUMENTS1)) ||
				!strncasecmp( name, ATTR_JOB_ARGUMENTS2, strlen(ATTR_JOB_ARGUMENTS2)) ||
				!strncasecmp( name, ATTR_TRANSFER_INTERMEDIATE_FILES, strlen(ATTR_TRANSFER_INTERMEDIATE_FILES)) ||
				!strncasecmp( name, ATTR_TRANSFER_INPUT_FILES, strlen(ATTR_TRANSFER_INPUT_FILES)) ) {
				can_send_it = true;
			}
		}else {
			// We will send the entire job ClassAd to vmgahp
			can_send_it = true;
		}

		if( !can_send_it ) {
			continue;
		}

		vmAttr.sprintf( "%s = %s", name, ExprTreeToString( expr ) );

		if ( write_line( vmAttr.Value() ) == false ) {
			return false;
		}

		total_len += vmAttr.Length();
		if( total_len > 2048 ) {
			// Give some time for vmgahp to read this pipe
			// (crude flow control: pause after roughly 2KB written)
			sleep(1);
			printSystemErrorMsg();
			total_len = 0;
		}
	}

	// Terminate the ad with CLASSAD_END and wait for the ack.
	static const char *endcommand = "CLASSAD_END";

	if(write_line(endcommand) == false) {
		return false;
	}

	// give some time to vmgahp
	sleep(1);

	Gahp_Args end_result;
	if( read_argv(end_result) == false ) {
		return false;
	}
	if( end_result.argc == 0 || end_result.argv[0][0] != 'S' ) {
		dprintf(D_ALWAYS,"VMGAHP command '%s' failed\n",endcommand);
		return false;
	}

	// Remember the working dir we successfully published.
	m_workingdir = workingdir;
	return true;
}
// Build a human-readable explanation of why the policy expression fired,
// plus a numeric reason code and subcode.
//
// reason         (out) - explanation text.
// reason_code    (out) - CONDOR_HOLD_CODE_* value, chosen by whether the
//                        firing expression was a job attribute or a system
//                        macro, and whether it evaluated to UNDEFINED.
// reason_subcode (out) - integer from an optional *SubCode/_SUBCODE expr.
//
// Returns false only when there is no ad or no firing expression recorded.
// The custom reason is searched in this order: a config parameter
// (<EXPR>_REASON, system macros only), then a job attribute (<Expr>Reason,
// job attributes only); if neither yields text, a generic
// "expression ... evaluated to TRUE/FALSE/UNDEFINED" string is formatted.
bool UserPolicy::FiringReason(MyString &reason,int &reason_code,int &reason_subcode)
{
	reason_code = 0;
	reason_subcode = 0;

	if ( m_ad == NULL || m_fire_expr == NULL ) {
		return false;
	}

	const char * expr_src;
	MyString exprString;
	std::string reason_expr_param;
	std::string reason_expr_attr;
	std::string subcode_expr_param;
	std::string subcode_expr_attr;

	switch(m_fire_source) {
		case FS_NotYet:
			expr_src = "UNKNOWN (never set)";
			break;

		case FS_JobAttribute:
		{
			expr_src = "job attribute";
			ExprTree *tree;
			tree = m_ad->LookupExpr( m_fire_expr );

			// Get a formatted expression string
			if( tree ) {
				exprString = ExprTreeToString( tree );
			}
			// m_fire_expr_val == -1 means the expression evaluated to
			// UNDEFINED; use the *Undefined hold code and skip the
			// custom reason/subcode attribute names.
			if( m_fire_expr_val == -1 ) {
				reason_code = CONDOR_HOLD_CODE_JobPolicyUndefined;
			}
			else {
				reason_code = CONDOR_HOLD_CODE_JobPolicy;
				// e.g. PeriodicHoldReason / PeriodicHoldSubCode
				sprintf(reason_expr_attr,"%sReason", m_fire_expr);
				sprintf(subcode_expr_attr,"%sSubCode", m_fire_expr);
			}
			break;
		}

		case FS_SystemMacro:
		{
			expr_src = "system macro";
			// For system macros the expression text comes from config.
			char * val = param(m_fire_expr);
			exprString = val;
			free(val);
			if( m_fire_expr_val == -1 ) {
				reason_code = CONDOR_HOLD_CODE_SystemPolicyUndefined;
			}
			else {
				reason_code = CONDOR_HOLD_CODE_SystemPolicy;
				// e.g. SYSTEM_PERIODIC_HOLD_REASON / ..._SUBCODE
				sprintf(reason_expr_param,"%s_REASON", m_fire_expr);
				sprintf(subcode_expr_param,"%s_SUBCODE", m_fire_expr);
			}
			break;
		}

		default:
			expr_src = "UNKNOWN (bad value)";
			break;
	}

	reason = "";

	// Evaluate the subcode expression (config param takes precedence over
	// job attribute). Config-sourced expressions are evaluated by stashing
	// them in a scratch attribute on the ad, then deleting it.
	MyString subcode_expr;
	if( !subcode_expr_param.empty() &&
		param(subcode_expr,subcode_expr_param.c_str(),NULL) &&
		!subcode_expr.IsEmpty())
	{
		m_ad->AssignExpr(ATTR_SCRATCH_EXPRESSION, subcode_expr.Value());
		m_ad->EvalInteger(ATTR_SCRATCH_EXPRESSION, m_ad, reason_subcode);
		m_ad->Delete(ATTR_SCRATCH_EXPRESSION);
	}
	else if( !subcode_expr_attr.empty() )
	{
		m_ad->EvalInteger(subcode_expr_attr.c_str(), m_ad, reason_subcode);
	}

	// Same precedence for the custom reason string.
	MyString reason_expr;
	if( !reason_expr_param.empty() &&
		param(reason_expr,reason_expr_param.c_str(),NULL) &&
		!reason_expr.IsEmpty())
	{
		m_ad->AssignExpr(ATTR_SCRATCH_EXPRESSION, reason_expr.Value());
		m_ad->EvalString(ATTR_SCRATCH_EXPRESSION, m_ad, reason);
		m_ad->Delete(ATTR_SCRATCH_EXPRESSION);
	}
	else if( !reason_expr_attr.empty() )
	{
		m_ad->EvalString(reason_expr_attr.c_str(), m_ad, reason);
	}

	// A custom reason was produced; we're done.
	if( !reason.IsEmpty() ) {
		return true;
	}

	// Format up the reason string
	reason.sprintf( "The %s %s expression '%s' evaluated to ",
					expr_src,
					m_fire_expr,
					exprString.Value());

	// Get a string for it's value
	switch( m_fire_expr_val ) {
	case 0:
		reason += "FALSE";
		break;
	case 1:
		reason += "TRUE";
		break;
	case -1:
		reason += "UNDEFINED";
		break;
	default:
		// Any other value is a programming error upstream.
		EXCEPT( "Unrecognized FiringExpressionValue: %d",
				m_fire_expr_val );
		break;
	}

	return true;
}
// Fetch the job queue from the Quill database (PostgreSQL) and invoke
// process_func on each job ad that matches the current query constraint.
//
// dbconn            - database connection string.
// lastUpdate        (in/out) - passed through to the snapshot iterator;
//                    presumably the timestamp of the last queue update —
//                    TODO confirm against JobQueueSnapshot.
// process_func      - callback applied to each matching ad; if it returns
//                    nonzero, this function clears and deletes the ad
//                    (i.e. the callback declined ownership).
// process_func_data - opaque pointer handed back to the callback.
//
// Returns Q_OK on success (including the no-postgres build, which is a
// no-op), Q_COMMUNICATION_ERROR on snapshot failure, or the error from
// query.makeQuery().
int CondorQ::fetchQueueFromDBAndProcess ( const char *dbconn,
										  char *&lastUpdate,
										  condor_q_process_func process_func,
										  void * process_func_data,
										  CondorError* /*errstack*/ )
{
#ifndef HAVE_EXT_POSTGRESQL
	// Built without Quill/PostgreSQL support: silence unused-parameter
	// warnings and report success without doing anything.
	(void) dbconn;
	(void) lastUpdate;
	(void) process_func;
	(void) process_func_data;
#else
	int result;
	JobQueueSnapshot *jqSnapshot;
	const char *constraint;
	ClassAd *ad;
	QuillErrCode rv;
	ExprTree *tree;

	ASSERT(process_func);

	jqSnapshot = new JobQueueSnapshot(dbconn);

	rv = jqSnapshot->startIterateAllClassAds(clusterarray,
						numclusters,
						procarray,
						numprocs,
						schedd,
						FALSE,
						scheddBirthdate,
						lastUpdate);

	if (rv == QUILL_FAILURE) {
		delete jqSnapshot;
		return Q_COMMUNICATION_ERROR;
	}
	else if (rv == JOB_QUEUE_EMPTY) {
		// Empty queue is not an error.
		delete jqSnapshot;
		return Q_OK;
	}

	// make the query ad
	if ((result = query.makeQuery (tree)) != Q_OK) {
		delete jqSnapshot;
		return result;
	}

	// NOTE(review): the tree is deleted right after stringification, so
	// ExprTreeToString presumably returns storage that outlives the tree
	// (the sibling fetchQueueFromDB does the same) — verify before reuse.
	constraint = ExprTreeToString(tree);
	delete tree;

	ad = getDBNextJobByConstraint(constraint, jqSnapshot);

	while (ad != (ClassAd *) 0) {
		// Process the data and insert it into the list
		if ((*process_func) (ad, process_func_data) ) {
			ad->Clear();
			delete ad;
		}

		ad = getDBNextJobByConstraint(constraint, jqSnapshot);
	}

	delete jqSnapshot;
#endif /* HAVE_EXT_POSTGRESQL */

	return Q_OK;
}
void doContactSchedd() { if (command_queue.IsEmpty()) { daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); // Come back in a min return; } dprintf(D_FULLDEBUG,"in doContactSchedd\n"); SchedDRequest * current_command = NULL; int error=FALSE; std::string error_msg; CondorError errstack; bool do_reschedule = false; int failure_line_num = 0; int failure_errno = 0; // Try connecting to schedd DCSchedd dc_schedd ( ScheddAddr, ScheddPool ); if (dc_schedd.error() || !dc_schedd.locate()) { sprintf( error_msg, "Error locating schedd %s", ScheddAddr ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); // If you can't connect return "Failure" on every job request command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command == SchedDRequest::SDC_STATUS_CONSTRAINED) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0"}; enqueue_result (current_command->request_id, result, 3); } else if (current_command->command == SchedDRequest::SDC_SUBMIT_JOB) { const char * result[] = { GAHP_RESULT_FAILURE, NULL, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); } else if (current_command->command == SchedDRequest::SDC_UPDATE_LEASE) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; enqueue_result (current_command->request_id, result, 3); } else { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } current_command->status = SchedDRequest::SDCS_COMPLETED; } } SchedDRequest::schedd_command_type commands [] = { SchedDRequest::SDC_REMOVE_JOB, SchedDRequest::SDC_HOLD_JOB, SchedDRequest::SDC_RELEASE_JOB }; const char * command_titles [] = { "REMOVE_JOB", "HOLD_JOB", "RELEASE_JOB" }; // REMOVE // HOLD // RELEASE int i=0; while (i<3) { StringList id_list; SimpleList <SchedDRequest*> this_batch; SchedDRequest::schedd_command_type 
this_command = commands[i]; const char * this_action = command_titles[i]; const char * this_reason = NULL; dprintf (D_FULLDEBUG, "Processing %s requests\n", this_action); error = FALSE; // Create a batch of commands with the same command type AND the same reason command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != this_command) continue; if ((this_reason != NULL) && (strcmp (current_command->reason, this_reason) != 0)) continue; if (this_reason == NULL) this_reason = current_command->reason; char job_id_buff[30]; sprintf (job_id_buff, "%d.%d", current_command->cluster_id, current_command->proc_id); id_list.append (job_id_buff); this_batch.Append (current_command); } // If we haven't found any.... if (id_list.isEmpty()) { i++; continue; // ... then try the next command } // Perform the appropriate command on the current batch ClassAd * result_ad= NULL; if (this_command == SchedDRequest::SDC_REMOVE_JOB) { errstack.clear(); result_ad= dc_schedd.removeJobs ( &id_list, this_reason, &errstack); } else if (this_command == SchedDRequest::SDC_HOLD_JOB) { errstack.clear(); result_ad= dc_schedd.holdJobs ( &id_list, this_reason, NULL, &errstack); } else if (this_command == SchedDRequest::SDC_RELEASE_JOB) { errstack.clear(); result_ad= dc_schedd.releaseJobs ( &id_list, this_reason, &errstack); } else { EXCEPT( "Unexpected command type %d in doContactSchedd", this_command ); } // Analyze the result ad if (!result_ad) { error = TRUE; sprintf( error_msg, "Error connecting to schedd %s %s: %s", ScheddAddr, dc_schedd.addr(), errstack.getFullText() ); } else { result_ad->dPrint (D_FULLDEBUG); if ( this_command == SchedDRequest::SDC_RELEASE_JOB ) { do_reschedule = true; } } // Go through the batch again, and create responses for each request this_batch.Rewind(); while (this_batch.Next(current_command)) { // Check the result char job_id_buff[30]; if (result_ad && (error == 
FALSE)) { sprintf (job_id_buff, "job_%d_%d", current_command->cluster_id, current_command->proc_id); int remove_result; if (result_ad->LookupInteger (job_id_buff, remove_result)) { switch (remove_result) { case AR_ERROR: error = TRUE; error_msg = "General Error"; break; case AR_SUCCESS: error = FALSE; break; case AR_NOT_FOUND: error = TRUE; error_msg = "Job not found"; break; case AR_BAD_STATUS: error = TRUE; error_msg = "Bad job status"; break; case AR_ALREADY_DONE: error = TRUE; error_msg = "Already done"; break; case AR_PERMISSION_DENIED: error = TRUE; error_msg = "Permission denied"; break; default: error = TRUE; error_msg = "Unknown Result"; } // hctiws } else { error_msg = "Unable to get result"; } // fi lookup result for job } // fi error == FALSE if (error) { dprintf (D_ALWAYS, "Error (operation: %s) %d.%d: %s\n", this_action, current_command->cluster_id, current_command->proc_id, error_msg.c_str()); const char * result[2]; result[0] = GAHP_RESULT_FAILURE; result[1] = error_msg.c_str(); enqueue_result (current_command->request_id, result, 2); } else { dprintf (D_ALWAYS, "Succeess (operation: %s) %d.%d\n", this_action, current_command->cluster_id, current_command->proc_id); const char * result[2]; result[0] = GAHP_RESULT_SUCCESS; result[1] = NULL; enqueue_result (current_command->request_id, result, 2); } // fi error // Mark the status current_command->status = SchedDRequest::SDCS_COMPLETED; } // elihw this_batch if ( result_ad ) { delete result_ad; } } dprintf (D_FULLDEBUG, "Processing JOB_STAGE_IN requests\n"); // JOB_STAGE_IN int MAX_BATCH_SIZE=1; // This should be a config param SimpleList <SchedDRequest*> stage_in_batch; do { stage_in_batch.Clear(); command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_STAGE_IN) continue; dprintf (D_ALWAYS, "Adding %d.%d to STAGE_IN batch\n", current_command->cluster_id, 
current_command->proc_id); stage_in_batch.Append (current_command); if (stage_in_batch.Number() >= MAX_BATCH_SIZE) break; } if (stage_in_batch.Number() > 0) { ClassAd ** array = new ClassAd*[stage_in_batch.Number()]; i=0; stage_in_batch.Rewind(); while (stage_in_batch.Next(current_command)) { array[i++] = current_command->classad; } error = FALSE; errstack.clear(); if (!dc_schedd.spoolJobFiles( stage_in_batch.Number(), array, &errstack )) { error = TRUE; sprintf( error_msg, "Error sending files to schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } delete [] array; stage_in_batch.Rewind(); while (stage_in_batch.Next(current_command)) { current_command->status = SchedDRequest::SDCS_COMPLETED; if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } else { const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); } } // elihw (command_queue) } // fi has STAGE_IN requests } while (stage_in_batch.Number() > 0); dprintf (D_FULLDEBUG, "Processing JOB_STAGE_OUT requests\n"); // JOB_STAGE_OUT SimpleList <SchedDRequest*> stage_out_batch; command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_STAGE_OUT) continue; stage_out_batch.Append (current_command); } if (stage_out_batch.Number() > 0) { std::string constraint = ""; stage_out_batch.Rewind(); int jobsexpected = stage_out_batch.Number(); while (stage_out_batch.Next(current_command)) { sprintf_cat( constraint, "(ClusterId==%d&&ProcId==%d)||", current_command->cluster_id, current_command->proc_id ); } constraint += "False"; error = FALSE; errstack.clear(); int jobssent; if (!dc_schedd.receiveJobSandbox( constraint.c_str(), &errstack, &jobssent )) { error = TRUE; sprintf( error_msg, "Error receiving files 
from schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } if(error == FALSE && jobssent != jobsexpected) { error = TRUE; sprintf( error_msg, "Schedd %s didn't send expected files", ScheddAddr ); dprintf (D_ALWAYS, "Transfered files for %d jobs but got files for %d jobs. (Schedd %s with contraint %s\n", jobsexpected, jobssent, ScheddAddr, constraint.c_str()); } stage_out_batch.Rewind(); while (stage_out_batch.Next(current_command)) { current_command->status = SchedDRequest::SDCS_COMPLETED; if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } else { const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); } } // elihw (command_queue) } // fi has STAGE_OUT requests dprintf (D_FULLDEBUG, "Processing JOB_REFRESH_PROXY requests\n"); CondorVersionInfo ver_info(dc_schedd.version()); bool delegate_credential; if ( ver_info.built_since_version(6,7,19) && param_boolean( "DELEGATE_JOB_GSI_CREDENTIALS", true ) ) { delegate_credential = true; } else { delegate_credential = false; } // JOB_REFRESH_PROXY command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_REFRESH_PROXY) continue; time_t expiration_time = GetDesiredDelegatedJobCredentialExpiration(current_command->classad); time_t result_expiration_time = 0; bool result; errstack.clear(); if ( delegate_credential ) { result = dc_schedd.delegateGSIcredential( current_command->cluster_id, current_command->proc_id, current_command->proxy_file, expiration_time, &result_expiration_time, &errstack ); // Currently, we do not propagate the actual resulting // expiration time back to the gridmanager. We // probably should. 
} else { result = dc_schedd.updateGSIcredential( current_command->cluster_id, current_command->proc_id, current_command->proxy_file, &errstack ); } current_command->status = SchedDRequest::SDCS_COMPLETED; if (result == false) { sprintf( error_msg, "Error refreshing proxy to schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); const char * result_to_queue[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result_to_queue, 2); } else { const char * result_to_queue[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result_to_queue, 2); } } // Now do all the QMGMT transactions error = FALSE; // Try connecting to the queue Qmgr_connection * qmgr_connection; if ((qmgr_connection = ConnectQ(dc_schedd.addr(), QMGMT_TIMEOUT, false, NULL, NULL, dc_schedd.version() )) == NULL) { error = TRUE; sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } else { errno = 0; AbortTransaction(); // Just so we can call BeginTransaction() in the loop if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } dprintf (D_FULLDEBUG, "Processing UPDATE_CONSTRAINED/UDATE_JOB requests\n"); // UPDATE_CONSTRAINED // UDATE_JOB command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if ((current_command->command != SchedDRequest::SDC_UPDATE_CONSTRAINED) && (current_command->command != SchedDRequest::SDC_UPDATE_JOB)) continue; if (qmgr_connection == NULL) goto update_report_result; error = FALSE; errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } current_command->classad->ResetExpr(); ExprTree *tree; const char *lhstr, *rhstr; while( current_command->classad->NextExpr(lhstr, tree) ) { rhstr = 
ExprTreeToString( tree ); if( !lhstr || !rhstr) { sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s", current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } else { if (current_command->command == SchedDRequest::SDC_UPDATE_CONSTRAINED) { if( SetAttributeByConstraint(current_command->constraint, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed (errno=%d) to SetAttributeByConstraint %s=%s for constraint %s", errno, lhstr, rhstr, current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } else if (current_command->command == SchedDRequest::SDC_UPDATE_JOB) { if( SetAttribute(current_command->cluster_id, current_command->proc_id, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetAttribute() %s=%s for job %d.%d", lhstr, rhstr, current_command->cluster_id, current_command->proc_id); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } } if (error) break; } // elihw classad update_report_result: if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } } else { if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; } // fi } // elihw dprintf 
(D_FULLDEBUG, "Processing UPDATE_LEASE requests\n"); // UPDATE_LEASE command_queue.Rewind(); while (command_queue.Next(current_command)) { error = FALSE; if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_UPDATE_LEASE) continue; std::string success_job_ids=""; if (qmgr_connection == NULL) { sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr ); error = TRUE; } else { error = FALSE; errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } for (i=0; i<current_command->num_jobs; i++) { time_t time_now = time(NULL); int duration = current_command->expirations[i].expiration - time_now; dprintf (D_FULLDEBUG, "Job %d.%d SetTimerAttribute=%d\n", current_command->expirations[i].cluster, current_command->expirations[i].proc, duration); if (SetTimerAttribute (current_command->expirations[i].cluster, current_command->expirations[i].proc, ATTR_TIMER_REMOVE_CHECK, duration) < 0) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } dprintf (D_ALWAYS, "Unable to SetTimerAttribute(%d, %d), errno=%d\n", current_command->expirations[i].cluster, current_command->expirations[i].proc, errno); } else { // Append job id to the result line if (success_job_ids.length() > 0) success_job_ids += ","; sprintf_cat( success_job_ids, "%d.%d", current_command->expirations[i].cluster, current_command->expirations[i].proc); } } //rof jobs for request } // fi error if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } } else { 
if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, NULL, success_job_ids.length()?success_job_ids.c_str():NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } // fi } // elihw UPDATE_LEASE requests dprintf (D_FULLDEBUG, "Processing SUBMIT_JOB requests\n"); // SUBMIT_JOB command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_SUBMIT_JOB) continue; int ClusterId = -1; int ProcId = -1; if (qmgr_connection == NULL) { error = TRUE; goto submit_report_result; } errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } error = FALSE; if ((ClusterId = NewCluster()) >= 0) { ProcId = NewProc (ClusterId); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } if ( ClusterId < 0 ) { error = TRUE; error_msg = "Unable to create a new job cluster"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } else if ( ProcId < 0 ) { error = TRUE; error_msg = "Unable to create a new job proc"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } if ( ClusterId == -2 || ProcId == -2 ) { error = TRUE; error_msg = "Number of submitted jobs would exceed MAX_JOBS_SUBMITTED\n"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } // Adjust the argument/environment syntax based on the version // of the schedd we are talking to. if( error == FALSE) { CondorVersionInfo version_info(dc_schedd.version()); ArgList arglist; MyString arg_error_msg; Env env_obj; MyString env_error_msg; if(!arglist.AppendArgsFromClassAd(current_command->classad,&arg_error_msg) || ! 
arglist.InsertArgsIntoClassAd(current_command->classad,&version_info,&arg_error_msg)) { sprintf( error_msg, "ERROR: ClassAd problem in converting arguments to syntax " "for schedd (version=%s): %s\n", dc_schedd.version() ? dc_schedd.version() : "NULL", arg_error_msg.Value()); dprintf( D_ALWAYS,"%s\n", error_msg.c_str() ); error = TRUE; } if(!env_obj.MergeFrom(current_command->classad,&env_error_msg) || !env_obj.InsertEnvIntoClassAd(current_command->classad,&env_error_msg,NULL,&version_info)) { sprintf( error_msg, "ERROR: Failed to convert environment to target syntax" " for schedd (version %s): %s\n", dc_schedd.version() ? dc_schedd.version() : "NULL", env_error_msg.Value()); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } if( error == FALSE ) { // See the comment in the function body of ExpandInputFileList // for an explanation of what is going on here. MyString transfer_input_error_msg; if( !FileTransfer::ExpandInputFileList( current_command->classad, transfer_input_error_msg ) ) { dprintf( D_ALWAYS, "%s\n", transfer_input_error_msg.Value() ); error = TRUE; } } if ( error == FALSE ) { current_command->classad->Assign(ATTR_CLUSTER_ID, ClusterId); current_command->classad->Assign(ATTR_PROC_ID, ProcId); // Special case for the job lease int expire_time; if ( current_command->classad->LookupInteger( ATTR_TIMER_REMOVE_CHECK, expire_time ) ) { if ( SetTimerAttribute( ClusterId, ProcId, ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL) ) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetTimerAttribute %s=%ld for job %d.%d", ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL), ClusterId, ProcId ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; goto submit_report_result; } current_command->classad->Delete( ATTR_TIMER_REMOVE_CHECK ); } // Set all the classad attribute on the remote classad current_command->classad->ResetExpr(); 
ExprTree *tree; const char *lhstr, *rhstr; while( current_command->classad->NextExpr(lhstr, tree) ) { rhstr = ExprTreeToString( tree ); if( !lhstr || !rhstr) { sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s", current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } else if( SetAttribute (ClusterId, ProcId, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetAttribute %s=%s for job %d.%d", lhstr, rhstr, ClusterId, ProcId ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } if (error) break; } // elihw classad } // fi error==FALSE submit_report_result: char job_id_buff[30]; sprintf (job_id_buff, "%d.%d", ClusterId, ProcId); if (error) { const char * result[] = { GAHP_RESULT_FAILURE, job_id_buff, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } current_command->status = SchedDRequest::SDCS_COMPLETED; } else { if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, job_id_buff, NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } } // elihw dprintf (D_FULLDEBUG, "Processing STATUS_CONSTRAINED requests\n"); // STATUS_CONSTRAINED command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_STATUS_CONSTRAINED) continue; if (qmgr_connection != NULL) { SimpleList <MyString *> matching_ads; error = FALSE; ClassAd *next_ad; ClassAdList adlist; // Only use 
GetAllJobsByConstraint if remote schedd is // 6.9.5 or newer. Previous versions either did not // support this call, or they closed the Qmgmt connection // as a side-effect of this call. if( ver_info.built_since_version(6,9,5) ) { dprintf( D_FULLDEBUG, "Calling GetAllJobsByConstraint(%s)\n", current_command->constraint ); // NOTE: this could be made more efficient if we knew // the list of attributes to query. For lack of that, // we just get all attributes. GetAllJobsByConstraint( current_command->constraint, "", adlist); } else { // This is the old latency-prone method. dprintf( D_FULLDEBUG, "Calling GetNextJobByConstraint(%s)\n", current_command->constraint ); next_ad = GetNextJobByConstraint( current_command->constraint, 1 ); while( next_ad != NULL ) { adlist.Insert( next_ad ); next_ad = GetNextJobByConstraint( current_command->constraint, 0 ); } } // NOTE: ClassAdList will deallocate the ClassAds in it adlist.Rewind(); while( (next_ad=adlist.Next()) ) { MyString * da_buffer = new MyString(); // Use a ptr to avoid excessive copying if ( useXMLClassads ) { ClassAdXMLUnparser unparser; unparser.SetUseCompactSpacing(true); unparser.Unparse (next_ad, *da_buffer); } else { NewClassAdUnparser unparser; unparser.SetUseCompactSpacing(true); unparser.Unparse (next_ad, *da_buffer); } matching_ads.Append (da_buffer); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } // now output this list of classads into a result const char ** result = new const char* [matching_ads.Length() + 3]; std::string _ad_count; sprintf( _ad_count, "%d", matching_ads.Length() ); int count=0; result[count++] = GAHP_RESULT_SUCCESS; result[count++] = NULL; result[count++] = _ad_count.c_str(); MyString *next_string; matching_ads.Rewind(); while (matching_ads.Next(next_string)) { result[count++] = next_string->Value(); } enqueue_result (current_command->request_id, result, count); current_command->status = SchedDRequest::SDCS_COMPLETED; 
// Cleanup matching_ads.Rewind(); while (matching_ads.Next(next_string)) { delete next_string; } //CommitTransaction(); delete [] result; } else { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } } //elihw contact_schedd_disconnect: if ( qmgr_connection != NULL ) { DisconnectQ (qmgr_connection, FALSE); } if ( failure_line_num ) { // We had an error talking to the schedd. Take all of our // incomplete commands and mark them as failed. // TODO Consider retrying these commands, rather than // immediately marking them as failed. if ( failure_errno == ETIMEDOUT ) { dprintf( D_ALWAYS, "Timed out talking to schedd at line %d in " "doContactSchedd()\n", failure_line_num ); sprintf( error_msg, "Timed out talking to schedd" ); } else { dprintf( D_ALWAYS, "Error talking to schedd at line %d in " "doContactSchedd(), errno=%d (%s)\n", failure_line_num, failure_errno, strerror(failure_errno) ); sprintf( error_msg, "Error talking to schedd" ); } command_queue.Rewind(); while (command_queue.Next(current_command)) { if ( current_command->status != SchedDRequest::SDCS_NEW ) { continue; } switch( current_command->command ) { case SchedDRequest::SDC_UPDATE_JOB: case SchedDRequest::SDC_UPDATE_CONSTRAINED: { const char *result[2] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_UPDATE_LEASE: { const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_SUBMIT_JOB: { const char *result[3] = { GAHP_RESULT_FAILURE, "-1.-1", error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); current_command->status 
= SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_STATUS_CONSTRAINED: { const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; default: // Do nothing ; } } } if ( do_reschedule ) { dc_schedd.reschedule(); } // Write all of our results to our parent. flush_results(); dprintf (D_FULLDEBUG, "Finishing doContactSchedd()\n"); // Clean up the list command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status == SchedDRequest::SDCS_COMPLETED) { command_queue.DeleteCurrent(); delete current_command; } } // Come back soon.. // QUESTION: Should this always be a fixed time period? daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); }