void HoldJob( const char* long_reason, const char* short_reason, int reason_code, int reason_subcode ) { char subject[ BUFSIZ ]; FILE *mailer; sprintf( subject, "Condor Job %d.%d put on hold\n", Proc->id.cluster, Proc->id.proc ); if( ! JobAd ) { dprintf( D_ALWAYS, "In HoldJob() w/ NULL JobAd!\n" ); exit( JOB_SHOULD_HOLD ); } ExitReason = JOB_SHOULD_HOLD; if ( !ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT) ) { dprintf( D_ALWAYS, "Failed to connect to schedd!\n" ); } SetAttributeString( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON, short_reason ); SetAttributeInt( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON_CODE, reason_code ); SetAttributeInt( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON_SUBCODE, reason_subcode ); if ( !DisconnectQ(0) ) { dprintf( D_ALWAYS, "Failed to commit updated job queue status!\n" ); } mailer = email_user_open(JobAd, subject); if( ! mailer ) { // User didn't want email, so just exit now with the right // value so the schedd actually puts the job on hold. dprintf( D_ALWAYS, "Job going into Hold state.\n"); dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n", JOB_SHOULD_HOLD); exit( JOB_SHOULD_HOLD ); } fprintf( mailer, "Your condor job " ); if( Proc->args_v1or2[0] ) { ArgList args; MyString args_string; args.AppendArgsV1or2Raw(Proc->args_v1or2[0],NULL); args.GetArgsStringForDisplay(&args_string); fprintf( mailer, "%s %s ", Proc->cmd[0], args_string.Value() ); } else { fprintf( mailer, "%s ", Proc->cmd[0] ); } fprintf( mailer, "\nis being put on hold.\n\n" ); fprintf( mailer, "%s", long_reason ); email_close(mailer); // Now that the user knows why, exit with the right code. dprintf( D_ALWAYS, "Job going into Hold state.\n"); dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n", JOB_SHOULD_HOLD); exit( JOB_SHOULD_HOLD ); }
int InitJobAd(int cluster, int proc) { if (!JobAd) { // just get the job ad from the schedd once if ( IpcFile ) { // get the jobad file a file priv_state priv = set_condor_priv(); bool is_stdin = false; FILE* ipc_fp = NULL; if( IpcFile[0] == '-' && IpcFile[1] == '\0' ) { ipc_fp = stdin; is_stdin = true; } else { ipc_fp = safe_fopen_wrapper(IpcFile,"r"); } dprintf( D_FULLDEBUG, "Reading job ClassAd from %s\n", is_stdin ? "STDIN" : IpcFile ); int isEOF = 0; int isError = 0; int isEmpty = 0; if ( ipc_fp ) { JobAd = new ClassAd(ipc_fp,"***",isEOF,isError,isEmpty); if( ! is_stdin ) { fclose( ipc_fp ); unlink( IpcFile ); } } set_priv(priv); // if constructor failed, remove the JobAd, and // fallback on getting it via sockets. if (isError != 0 ) { if (JobAd) delete JobAd; JobAd = NULL; } if (JobAd) { // we're done checkForDebugging( JobAd ); return 0; } // if we made it here, we wanted to get the job ad // via the file, but failed dprintf(D_FULLDEBUG, "Failed to read job ad file %s, using socket\n", IpcFile); } if (!ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT, true)) { EXCEPT("Failed to connect to schedd!"); } JobAd = GetJobAd( cluster, proc ); DisconnectQ(NULL); checkForDebugging( JobAd ); } if (!JobAd) { EXCEPT( "failed to get job ad" ); } return 0; }
int CondorQ:: fetchQueue (ClassAdList &list, StringList &attrs, ClassAd *ad, CondorError* errstack) { Qmgr_connection *qmgr; ExprTree *tree; int result; char scheddString [32]; const char *constraint; bool useFastPath = false; // make the query ad if ((result = query.makeQuery (tree)) != Q_OK) return result; constraint = ExprTreeToString( tree ); delete tree; // connect to the Q manager init(); // needed to get default connect_timeout if (ad == 0) { // local case if( !(qmgr = ConnectQ( 0, connect_timeout, true, errstack)) ) { errstack->push("TEST", 0, "FOO"); return Q_SCHEDD_COMMUNICATION_ERROR; } useFastPath = true; } else { // remote case to handle condor_globalq if (!ad->LookupString (ATTR_SCHEDD_IP_ADDR, scheddString, sizeof(scheddString))) return Q_NO_SCHEDD_IP_ADDR; if( !(qmgr = ConnectQ( scheddString, connect_timeout, true, errstack)) ) return Q_SCHEDD_COMMUNICATION_ERROR; } // get the ads and filter them getAndFilterAds (constraint, attrs, list, useFastPath); DisconnectQ (qmgr); return Q_OK; }
int CondorQ::fetchQueueFromHostAndProcess ( const char *host, StringList &attrs, int fetch_opts, int match_limit, condor_q_process_func process_func, void * process_func_data, int useFastPath, CondorError* errstack, ClassAd ** psummary_ad) { Qmgr_connection *qmgr; ExprTree *tree; char *constraint; int result; // make the query ad if ((result = query.makeQuery (tree)) != Q_OK) return result; constraint = strdup( ExprTreeToString( tree ) ); delete tree; if (useFastPath > 1) { int result = fetchQueueFromHostAndProcessV2(host, constraint, attrs, fetch_opts, match_limit, process_func, process_func_data, connect_timeout, useFastPath, errstack, psummary_ad); free( constraint); return result; } if (fetch_opts != fetch_Jobs) { free( constraint ); return Q_UNSUPPORTED_OPTION_ERROR; } /* connect to the Q manager. use a timeout of 20 seconds, and a read-only connection. why 20 seconds? because careful research by Derek has shown that whenever one needs a periodic time value, 20 is always optimal. :^). */ init(); // needed to get default connect_timeout if( !(qmgr = ConnectQ( host, connect_timeout, true, errstack)) ) { free( constraint ); return Q_SCHEDD_COMMUNICATION_ERROR; } // get the ads and filter them result = getFilterAndProcessAds (constraint, attrs, match_limit, process_func, process_func_data, useFastPath); DisconnectQ (qmgr); free( constraint ); return result; }
/*
  Open (or reuse) the queue-management connection to MySchedd.

  If a connection is already held in `qmgr`, nothing is done.  Otherwise
  a new connection is attempted and the late-materialization flags are
  recomputed: both are cleared, and only when the connection succeeds
  and the schedd's version is at least 8.7.1 is has_late set and
  allows_late read from SCHEDD_ALLOW_LATE_MATERIALIZE.

  @param MySchedd  Supplies the address and version of the schedd.
  @param errstack  Receives connection errors from ConnectQ().
  @return true if a connection is held on return, false otherwise.
*/
bool
ActualScheddQ::Connect(DCSchedd & MySchedd, CondorError & errstack)
{
	if (qmgr != NULL) {
			// Already connected.
		return true;
	}

	qmgr = ConnectQ(MySchedd.addr(), 0 /* default */, false /* default */, &errstack, NULL, MySchedd.version());

		// Start from "no late materialization" on every fresh attempt.
	has_late = false;
	allows_late = false;

	if (qmgr == NULL) {
		return false;
	}

	CondorVersionInfo cvi(MySchedd.version());
	if (cvi.built_since_version(8,7,1)) {
		has_late = true;
			// The knob's default is the capability flag just set above.
		allows_late = param_boolean("SCHEDD_ALLOW_LATE_MATERIALIZE",has_late);
	}
	return true;
}
//--------------------------------------------------------------------------- Qmgr_connection * DagmanClassad::OpenConnection() { // Open job queue CondorError errstack; Qmgr_connection *queue = ConnectQ( _schedd->addr(), 0, false, &errstack, NULL, _schedd->version() ); if ( !queue ) { debug_printf( DEBUG_QUIET, "WARNING: failed to connect to queue manager (%s)\n", errstack.getFullText().c_str() ); check_warning_strictness( DAG_STRICT_3 ); return NULL; } return queue; }
int CondorQ:: fetchQueueFromHost (ClassAdList &list, StringList &attrs, const char *host, char const *schedd_version, CondorError* errstack) { Qmgr_connection *qmgr; ExprTree *tree; const char *constraint; int result; // make the query ad if ((result = query.makeQuery (tree)) != Q_OK) return result; constraint = ExprTreeToString( tree ); delete tree; /* connect to the Q manager. use a timeout of 20 seconds, and a read-only connection. why 20 seconds? because careful research by Derek has shown that whenever one needs a periodic time value, 20 is always optimal. :^). */ init(); // needed to get default connect_timeout if( !(qmgr = ConnectQ( host, connect_timeout, true, errstack)) ) return Q_SCHEDD_COMMUNICATION_ERROR; int useFastPath = 0; if( schedd_version && *schedd_version ) { CondorVersionInfo v(schedd_version); useFastPath = v.built_since_version(6,9,3) ? 1 : 0; if (v.built_since_version(8, 1, 5)) { useFastPath = 2; } } // get the ads and filter them result = getAndFilterAds (constraint, attrs, -1, list, useFastPath); DisconnectQ (qmgr); return result; }
int CondorQ::fetchQueueFromHostAndProcess ( const char *host, StringList &attrs, process_function process_func, bool useFastPath, CondorError* errstack) { Qmgr_connection *qmgr; ExprTree *tree; char *constraint; int result; // make the query ad if ((result = query.makeQuery (tree)) != Q_OK) return result; constraint = strdup( ExprTreeToString( tree ) ); delete tree; /* connect to the Q manager. use a timeout of 20 seconds, and a read-only connection. why 20 seconds? because careful research by Derek has shown that whenever one needs a periodic time value, 20 is always optimal. :^). */ init(); // needed to get default connect_timeout if( !(qmgr = ConnectQ( host, connect_timeout, true, errstack)) ) { free( constraint ); return Q_SCHEDD_COMMUNICATION_ERROR; } // get the ads and filter them result = getFilterAndProcessAds (constraint, attrs, process_func, useFastPath); DisconnectQ (qmgr); free( constraint ); return result; }
void doContactSchedd() { int rc; Qmgr_connection *schedd; BaseJob *curr_job; ClassAd *next_ad; char expr_buf[12000]; bool schedd_updates_complete = false; bool schedd_deletes_complete = false; bool add_remove_jobs_complete = false; bool update_jobs_complete = false; bool commit_transaction = true; int failure_line_num = 0; bool send_reschedule = false; std::string error_str = ""; StringList dirty_job_ids; char *job_id_str; PROC_ID job_id; CondorError errstack; dprintf(D_FULLDEBUG,"in doContactSchedd()\n"); initJobExprs(); contactScheddTid = TIMER_UNSET; // vacateJobs ///////////////////////////////////////////////////// if ( pendingScheddVacates.getNumElements() != 0 ) { std::string buff; StringList job_ids; VacateRequest curr_request; int result; ClassAd* rval; pendingScheddVacates.startIterations(); while ( pendingScheddVacates.iterate( curr_request ) != 0 ) { formatstr( buff, "%d.%d", curr_request.job->procID.cluster, curr_request.job->procID.proc ); job_ids.append( buff.c_str() ); } char *tmp = job_ids.print_to_string(); if ( tmp ) { dprintf( D_FULLDEBUG, "Calling vacateJobs on %s\n", tmp ); free(tmp); tmp = NULL; } rval = ScheddObj->vacateJobs( &job_ids, VACATE_FAST, &errstack ); if ( rval == NULL ) { formatstr( error_str, "vacateJobs returned NULL, CondorError: %s!", errstack.getFullText().c_str() ); goto contact_schedd_failure; } else { pendingScheddVacates.startIterations(); while ( pendingScheddVacates.iterate( curr_request ) != 0 ) { formatstr( buff, "job_%d_%d", curr_request.job->procID.cluster, curr_request.job->procID.proc ); if ( !rval->LookupInteger( buff.c_str(), result ) ) { dprintf( D_FULLDEBUG, "vacateJobs returned malformed ad\n" ); EXCEPT( "vacateJobs returned malformed ad" ); } else { dprintf( D_FULLDEBUG, " %d.%d vacate result: %d\n", curr_request.job->procID.cluster, curr_request.job->procID.proc,result); pendingScheddVacates.remove( curr_request.job->procID ); curr_request.result = (action_result_t)result; 
curr_request.job->SetEvaluateState(); completedScheddVacates.insert( curr_request.job->procID, curr_request ); } } delete rval; } } schedd = ConnectQ( ScheddAddr, QMGMT_TIMEOUT, false, NULL, myUserName, CondorVersion() ); if ( !schedd ) { error_str = "Failed to connect to schedd!"; goto contact_schedd_failure; } // CheckLeases ///////////////////////////////////////////////////// if ( checkLeasesSignaled ) { dprintf( D_FULLDEBUG, "querying for renewed leases\n" ); // Grab the lease attributes of all the jobs in our global hashtable. BaseJob::JobsByProcId.startIterations(); while ( BaseJob::JobsByProcId.iterate( curr_job ) != 0 ) { int new_expiration; rc = GetAttributeInt( curr_job->procID.cluster, curr_job->procID.proc, ATTR_TIMER_REMOVE_CHECK, &new_expiration ); if ( rc < 0 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } else { // This job doesn't have doesn't have a lease from // the submitter. Skip it. continue; } } curr_job->UpdateJobLeaseReceived( new_expiration ); } checkLeasesSignaled = false; } // end of handling check leases // AddJobs ///////////////////////////////////////////////////// if ( addJobsSignaled || firstScheddContact ) { int num_ads = 0; dprintf( D_FULLDEBUG, "querying for new jobs\n" ); // Make sure we grab all Globus Universe jobs (except held ones // that we previously indicated we were done with) // when we first start up in case we're recovering from a // shutdown/meltdown. // Otherwise, grab all jobs that are unheld and aren't marked as // currently being managed and aren't marked as not matched. // If JobManaged is undefined, equate it with false. // If Matched is undefined, equate it with true. // NOTE: Schedds from Condor 6.6 and earlier don't include // "(Universe==9)" in the constraint they give to the gridmanager, // so this gridmanager will pull down non-globus-universe ads, // although it won't use them. This is inefficient but not // incorrect behavior. 
if ( firstScheddContact ) { // Grab all jobs for us to manage. This expression is a // derivative of the expression below for new jobs. We add // "|| Managed =?= TRUE" to also get jobs our previous // incarnation was in the middle of managing when it died // (if it died unexpectedly). With the new term, the // "&& Managed =!= TRUE" from the new jobs expression becomes // superfluous (by boolean logic), so we drop it. sprintf( expr_buf, "%s && %s && ((%s && %s) || %s)", expr_schedd_job_constraint.c_str(), expr_not_completely_done.c_str(), expr_matched_or_undef.c_str(), expr_not_held.c_str(), expr_managed.c_str() ); } else { // Grab new jobs for us to manage sprintf( expr_buf, "%s && %s && %s && %s && %s", expr_schedd_job_constraint.c_str(), expr_not_completely_done.c_str(), expr_matched_or_undef.c_str(), expr_not_held.c_str(), expr_not_managed.c_str() ); } dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf); next_ad = GetNextJobByConstraint( expr_buf, 1 ); while ( next_ad != NULL ) { PROC_ID procID; BaseJob *old_job; int job_is_matched = 1; // default to true if not in ClassAd next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster ); next_ad->LookupInteger( ATTR_PROC_ID, procID.proc ); bool job_is_managed = jobExternallyManaged(next_ad); next_ad->LookupBool(ATTR_JOB_MATCHED,job_is_matched); if ( BaseJob::JobsByProcId.lookup( procID, old_job ) != 0 ) { JobType *job_type = NULL; BaseJob *new_job = NULL; // job had better be either managed or matched! (or both) ASSERT( job_is_managed || job_is_matched ); if ( MustExpandJobAd( next_ad ) ) { // Get the expanded ClassAd from the schedd, which // has the GridResource filled in with info from // the matched ad. 
delete next_ad; next_ad = NULL; next_ad = GetJobAd(procID.cluster,procID.proc); if ( next_ad == NULL && errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } if ( next_ad == NULL ) { // We may get here if it was not possible to expand // one of the $$() expressions. We don't want to // roll back the transaction and blow away the // hold that the schedd just put on the job, so // simply skip over this ad. dprintf(D_ALWAYS,"Failed to get expanded job ClassAd from Schedd for %d.%d. errno=%d\n",procID.cluster,procID.proc,errno); goto contact_schedd_next_add_job; } } // Search our job types for one that'll handle this job jobTypes.Rewind(); while ( jobTypes.Next( job_type ) ) { if ( job_type->AdMatchFunc( next_ad ) ) { // Found one! dprintf( D_FULLDEBUG, "Using job type %s for job %d.%d\n", job_type->Name, procID.cluster, procID.proc ); break; } } if ( job_type != NULL ) { new_job = job_type->CreateFunc( next_ad ); } else { dprintf( D_ALWAYS, "No handlers for job %d.%d\n", procID.cluster, procID.proc ); new_job = new BaseJob( next_ad ); } ASSERT(new_job); new_job->SetEvaluateState(); dprintf(D_ALWAYS,"Found job %d.%d --- inserting\n", new_job->procID.cluster,new_job->procID.proc); num_ads++; if ( !job_is_managed ) { rc = tSetAttributeString( new_job->procID.cluster, new_job->procID.proc, ATTR_JOB_MANAGED, MANAGED_EXTERNAL); if ( rc < 0 ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } } } else { // We already know about this job, skip // But also set Managed=true on the schedd so that it won't // keep signalling us about it delete next_ad; rc = tSetAttributeString( procID.cluster, procID.proc, ATTR_JOB_MANAGED, MANAGED_EXTERNAL ); if ( rc < 0 ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } } contact_schedd_next_add_job: next_ad = GetNextJobByConstraint( expr_buf, 0 ); } // end of while next_ad if ( errno == ETIMEDOUT ) { 
failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } dprintf(D_FULLDEBUG,"Fetched %d new job ads from schedd\n",num_ads); } // end of handling add jobs // RemoveJobs ///////////////////////////////////////////////////// // We always want to perform this check. Otherwise, we may overwrite a // REMOVED/HELD/COMPLETED status with something else below. { int num_ads = 0; dprintf( D_FULLDEBUG, "querying for removed/held jobs\n" ); // Grab jobs marked as REMOVED/COMPLETED or marked as HELD that we // haven't previously indicated that we're done with (by setting // JobManaged to "Schedd". sprintf( expr_buf, "(%s) && (%s) && (%s == %d || %s == %d || (%s == %d && %s =?= \"%s\"))", ScheddJobConstraint, expr_not_completely_done.c_str(), ATTR_JOB_STATUS, REMOVED, ATTR_JOB_STATUS, COMPLETED, ATTR_JOB_STATUS, HELD, ATTR_JOB_MANAGED, MANAGED_EXTERNAL ); dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf); next_ad = GetNextJobByConstraint( expr_buf, 1 ); while ( next_ad != NULL ) { PROC_ID procID; BaseJob *next_job; int curr_status; next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster ); next_ad->LookupInteger( ATTR_PROC_ID, procID.proc ); next_ad->LookupInteger( ATTR_JOB_STATUS, curr_status ); if ( BaseJob::JobsByProcId.lookup( procID, next_job ) == 0 ) { // Should probably skip jobs we already have marked as // held or removed next_job->JobAdUpdateFromSchedd( next_ad, true ); num_ads++; } else if ( curr_status == REMOVED ) { // If we don't know about the job, act like we got an // ADD_JOBS signal from the schedd the next time we // connect, so that we'll create a Job object for it // and decide how it needs to be handled. // TODO The AddJobs and RemoveJobs queries shoule be // combined into a single query. dprintf( D_ALWAYS, "Don't know about removed job %d.%d. " "Will treat it as a new job to manage\n", procID.cluster, procID.proc ); addJobsSignaled = true; } else { dprintf( D_ALWAYS, "Don't know about held/completed job %d.%d. 
" "Ignoring it\n", procID.cluster, procID.proc ); } delete next_ad; next_ad = GetNextJobByConstraint( expr_buf, 0 ); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } dprintf(D_FULLDEBUG,"Fetched %d job ads from schedd\n",num_ads); } if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } add_remove_jobs_complete = true; // Retrieve dirty attributes ///////////////////////////////////////////////////// if ( updateJobsSignaled ) { dprintf( D_FULLDEBUG, "querying for jobs with attribute updates\n" ); sprintf( expr_buf, "%s && %s && %s && %s", expr_schedd_job_constraint.c_str(), expr_not_completely_done.c_str(), expr_not_held.c_str(), expr_managed.c_str() ); dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf); next_ad = GetNextDirtyJobByConstraint( expr_buf, 1 ); while ( next_ad != NULL ) { ClassAd updates; char str[PROC_ID_STR_BUFLEN]; next_ad->LookupInteger( ATTR_CLUSTER_ID, job_id.cluster ); next_ad->LookupInteger( ATTR_PROC_ID, job_id.proc ); if ( GetDirtyAttributes( job_id.cluster, job_id.proc, &updates ) < 0 ) { dprintf( D_ALWAYS, "Failed to retrieve dirty attributes for job %d.%d\n", job_id.cluster, job_id.proc ); failure_line_num = __LINE__; delete next_ad; goto contact_schedd_disconnect; } else { dprintf (D_FULLDEBUG, "Retrieved updated attributes for job %d.%d\n", job_id.cluster, job_id.proc); dPrintAd(D_JOB, updates); } if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) { curr_job->JobAdUpdateFromSchedd( &updates, false ); ProcIdToStr( job_id, str ); dirty_job_ids.append( str ); } else { dprintf( D_ALWAYS, "Don't know about updated job %d.%d. 
" "Ignoring it\n", job_id.cluster, job_id.proc ); } delete next_ad; next_ad = GetNextDirtyJobByConstraint( expr_buf, 0 ); } } update_jobs_complete = true; // if ( BeginTransaction() < 0 ) { errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } // requestJobStatus ///////////////////////////////////////////////////// if ( pendingJobStatus.getNumElements() != 0 ) { JobStatusRequest curr_request; pendingJobStatus.startIterations(); while ( pendingJobStatus.iterate( curr_request ) != 0 ) { int status; rc = GetAttributeInt( curr_request.job_id.cluster, curr_request.job_id.proc, ATTR_JOB_STATUS, &status ); if ( rc < 0 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } else { // The job is not in the schedd's job queue. This // probably means that the user did a condor_rm -f, // so return a job status of REMOVED. status = REMOVED; } } // return status dprintf( D_FULLDEBUG, "%d.%d job status: %d\n", curr_request.job_id.cluster, curr_request.job_id.proc, status ); pendingJobStatus.remove( curr_request.job_id ); curr_request.job_status = status; daemonCore->Reset_Timer( curr_request.tid, 0 ); completedJobStatus.insert( curr_request.job_id, curr_request ); } } // Update existing jobs ///////////////////////////////////////////////////// ScheddUpdateRequest *curr_request; pendingScheddUpdates.startIterations(); while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) { curr_job = curr_request->m_job; dprintf(D_FULLDEBUG,"Updating classad values for %d.%d:\n", curr_job->procID.cluster, curr_job->procID.proc); const char *attr_name; const char *attr_value; ExprTree *expr; bool fake_job_in_queue = false; curr_job->jobAd->ResetExpr(); while ( curr_job->jobAd->NextDirtyExpr(attr_name, expr) == true && fake_job_in_queue == false ) { attr_value = ExprTreeToString( expr ); dprintf(D_FULLDEBUG," %s = 
%s\n",attr_name,attr_value); rc = SetAttribute( curr_job->procID.cluster, curr_job->procID.proc, attr_name, attr_value); if ( rc < 0 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } else { // The job is not in the schedd's job queue. This // probably means that the user did a condor_rm -f, // so pretend that all updates for the job succeed. // Otherwise, we'll never make forward progress on // the job. // TODO We should also fake a job status of REMOVED // to the job, so it can do what cleanup it can. fake_job_in_queue = true; break; } } } } if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } schedd_updates_complete = true; // Delete existing jobs ///////////////////////////////////////////////////// errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } pendingScheddUpdates.startIterations(); while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) { curr_job = curr_request->m_job; if ( curr_job->deleteFromSchedd ) { dprintf(D_FULLDEBUG,"Deleting job %d.%d from schedd\n", curr_job->procID.cluster, curr_job->procID.proc); rc = DestroyProc(curr_job->procID.cluster, curr_job->procID.proc); // NOENT means the job doesn't exist. Good enough for us. 
if ( rc < 0 && rc != DESTROYPROC_ENOENT) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } } } if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } schedd_deletes_complete = true; contact_schedd_disconnect: DisconnectQ( schedd, commit_transaction ); if ( add_remove_jobs_complete == true ) { firstScheddContact = false; addJobsSignaled = false; } else { formatstr( error_str, "Schedd connection error during Add/RemoveJobs at line %d!", failure_line_num ); goto contact_schedd_failure; } if ( update_jobs_complete == true ) { updateJobsSignaled = false; } else { formatstr( error_str, "Schedd connection error during dirty attribute update at line %d!", failure_line_num ); goto contact_schedd_failure; } if ( schedd_updates_complete == false ) { formatstr( error_str, "Schedd connection error during updates at line %d!", failure_line_num ); goto contact_schedd_failure; } // Clear dirty bits for all jobs updated if ( !dirty_job_ids.isEmpty() ) { ClassAd *rval; dprintf( D_FULLDEBUG, "Calling clearDirtyAttrs on %d jobs\n", dirty_job_ids.number() ); dirty_job_ids.rewind(); rval = ScheddObj->clearDirtyAttrs( &dirty_job_ids, &errstack ); if ( rval == NULL ) { dprintf(D_ALWAYS, "Failed to notify schedd to clear dirty attributes. CondorError: %s\n", errstack.getFullText().c_str() ); } delete rval; } // Wake up jobs that had schedd updates pending and delete job // objects that wanted to be deleted pendingScheddUpdates.startIterations(); while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) { curr_job = curr_request->m_job; curr_job->jobAd->ClearAllDirtyFlags(); if ( curr_job->deleteFromGridmanager ) { // If the Job object wants to delete the job from the // schedd but we failed to do so, don't delete the job // object yet; wait until we successfully delete the job // from the schedd. 
if ( curr_job->deleteFromSchedd == true && schedd_deletes_complete == false ) { continue; } // If wantRematch is set, send a reschedule now if ( curr_job->wantRematch ) { send_reschedule = true; } pendingScheddUpdates.remove( curr_job->procID ); pendingScheddVacates.remove( curr_job->procID ); pendingJobStatus.remove( curr_job->procID ); completedJobStatus.remove( curr_job->procID ); completedScheddVacates.remove( curr_job->procID ); delete curr_job; } else { pendingScheddUpdates.remove( curr_job->procID ); if ( curr_request->m_notify ) { curr_job->SetEvaluateState(); } } delete curr_request; } // Poke objects that wanted to be notified when a schedd update completed // successfully (possibly minus deletes) int timer_id; scheddUpdateNotifications.Rewind(); while ( scheddUpdateNotifications.Next( timer_id ) ) { daemonCore->Reset_Timer( timer_id, 0 ); } scheddUpdateNotifications.Clear(); if ( send_reschedule == true ) { ScheddObj->reschedule(); } // Check if we have any jobs left to manage. If not, exit. if ( BaseJob::JobsByProcId.getNumElements() == 0 ) { dprintf( D_ALWAYS, "No jobs left, shutting down\n" ); daemonCore->Send_Signal( daemonCore->getpid(), SIGTERM ); } lastContactSchedd = time(NULL); if ( schedd_deletes_complete == false ) { error_str = "Problem using DestroyProc to delete jobs!"; goto contact_schedd_failure; } scheddFailureCount = 0; // For each job that had dirty attributes, re-evaluate the policy dirty_job_ids.rewind(); while ( (job_id_str = dirty_job_ids.next()) != NULL ) { StrToProcIdFixMe(job_id_str, job_id); if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) { curr_job->EvalPeriodicJobExpr(); } } dprintf(D_FULLDEBUG,"leaving doContactSchedd()\n"); return; contact_schedd_failure: scheddFailureCount++; if ( error_str == "" ) { error_str = "Failure in doContactSchedd"; } if ( scheddFailureCount >= maxScheddFailures ) { dprintf( D_ALWAYS, "%s\n", error_str.c_str() ); EXCEPT( "Too many failures connecting to schedd!" 
); } dprintf( D_ALWAYS, "%s Will retry\n", error_str.c_str() ); lastContactSchedd = time(NULL); RequestContactSchedd(); return; }
void update_job_status( struct rusage *localp, struct rusage *remotep ) { int status = -1; double utime = 0.0; double stime = 0.0; int tot_sus=0, cum_sus=0, last_sus=0; char buf[1024*50]; // If the job completed, and there is no HISTORY file specified, // the don't bother to update the job ClassAd since it is about to be // flushed into the bit bucket by the schedd anyway. char *myHistoryFile = param("HISTORY"); if ((Proc->status == COMPLETED) && (myHistoryFile==NULL)) { return; } if (myHistoryFile) { free(myHistoryFile); } if (!JobAd) { EXCEPT( "update_job_status(): No job ad"); } JobAd->LookupInteger(ATTR_TOTAL_SUSPENSIONS, tot_sus); JobAd->LookupInteger(ATTR_CUMULATIVE_SUSPENSION_TIME, cum_sus); JobAd->LookupInteger(ATTR_LAST_SUSPENSION_TIME, last_sus); //new syntax, can use filesystem to authenticate if (!ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT) || GetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_STATUS, &status) < 0) { EXCEPT("Failed to connect to schedd!"); } job_report_update_queue( Proc ); if( status == REMOVED ) { dprintf( D_ALWAYS, "update_job_status(): Job %d.%d has been removed " "by condor_rm\n", Proc->id.cluster, Proc->id.proc ); } else { SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_TOTAL_SUSPENSIONS, tot_sus); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_CUMULATIVE_SUSPENSION_TIME, cum_sus); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_LAST_SUSPENSION_TIME, last_sus); update_job_rusage( localp, remotep ); Proc->image_size = ImageSize; SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_IMAGE_SIZE, ImageSize); // For standard universe. MemoryUsed==ImageSize, no need to param this one. // because imagesize is already the best measure of memory usage. 
SetAttribute(Proc->id.cluster, Proc->id.proc, ATTR_MEMORY_USAGE, "((ImageSize+1023)/1024)"); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_EXIT_STATUS, JobExitStatus); rusage_to_float( Proc->local_usage, &utime, &stime ); SetAttributeFloat(Proc->id.cluster, Proc->id.proc, ATTR_JOB_LOCAL_USER_CPU, utime); SetAttributeFloat(Proc->id.cluster, Proc->id.proc, ATTR_JOB_LOCAL_SYS_CPU, stime); rusage_to_float( Proc->remote_usage[0], &utime, &stime ); SetAttributeFloat(Proc->id.cluster, Proc->id.proc, ATTR_JOB_REMOTE_USER_CPU, utime); SetAttributeFloat(Proc->id.cluster, Proc->id.proc, ATTR_JOB_REMOTE_SYS_CPU, stime); dprintf(D_FULLDEBUG,"TIME DEBUG 3 USR remotep=%lu Proc=%lu utime=%f\n", remotep->ru_utime.tv_sec, Proc->remote_usage[0].ru_utime.tv_sec, utime); dprintf(D_FULLDEBUG,"TIME DEBUG 4 SYS remotep=%lu Proc=%lu utime=%f\n", remotep->ru_stime.tv_sec, Proc->remote_usage[0].ru_stime.tv_sec, stime); if( sock_RSC1 ) { float TotalBytesSentUpdate = TotalBytesSent + sock_RSC1->get_bytes_sent() + BytesSent; float TotalBytesRecvdUpdate = TotalBytesRecvd + sock_RSC1->get_bytes_recvd() + BytesRecvd; SetAttributeFloat( Proc->id.cluster, Proc->id.proc, ATTR_BYTES_SENT, TotalBytesSentUpdate ); SetAttributeFloat( Proc->id.cluster, Proc->id.proc, ATTR_BYTES_RECVD, TotalBytesRecvdUpdate ); float RSCBytesSentUpdate = sock_RSC1->get_bytes_sent() + RSCBytesSent; float RSCBytesRecvdUpdate = sock_RSC1->get_bytes_recvd() + RSCBytesRecvd; SetAttributeFloat( Proc->id.cluster, Proc->id.proc, ATTR_RSC_BYTES_SENT, RSCBytesSentUpdate ); SetAttributeFloat( Proc->id.cluster, Proc->id.proc, ATTR_RSC_BYTES_RECVD, RSCBytesRecvdUpdate ); } if( ExitReason == JOB_CKPTED || ExitReason == JOB_NOT_CKPTED ) { SetAttributeInt( Proc->id.cluster, Proc->id.proc, ATTR_LAST_VACATE_TIME, time(0) ); } if( ExitReason == JOB_CKPTED || LastCkptTime > LastRestartTime ) { int uncommitted_suspension_time = 0; JobAd->LookupInteger(ATTR_UNCOMMITTED_SUSPENSION_TIME, uncommitted_suspension_time); if( 
uncommitted_suspension_time > 0 ) { int committed_suspension_time = 0; GetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_COMMITTED_SUSPENSION_TIME, &committed_suspension_time); committed_suspension_time += uncommitted_suspension_time; SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_COMMITTED_SUSPENSION_TIME, committed_suspension_time); } } // if we had checkpointed, then save all of these attributes as well. if (LastCkptTime > LastRestartTime) { SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_LAST_CKPT_TIME, LastCkptTime); CommittedTime=0; GetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_COMMITTED_TIME, &CommittedTime); CommittedTime += LastCkptTime - LastRestartTime; SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_COMMITTED_TIME, CommittedTime); LastRestartTime = LastCkptTime; SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_NUM_CKPTS, NumCkpts); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_NUM_RESTARTS, NumRestarts); if (Executing_Arch) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_CKPT_ARCH, Executing_Arch); } if (Executing_OpSys) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_CKPT_OPSYS, Executing_OpSys); } // If we wrote a checkpoint, store the location in the // LastCkptServer attribute. If we didn't use a checkpoint // server (i.e., we stored it locally), then make sure // no LastCkptServer attribute is set. if (LastCkptServer) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_LAST_CKPT_SERVER, LastCkptServer); } else { DeleteAttribute(Proc->id.cluster, Proc->id.proc, ATTR_LAST_CKPT_SERVER); } if (LastCkptPlatform) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_LAST_CHECKPOINT_PLATFORM, LastCkptPlatform); } } // if the job completed, we should include the run-time in // committed time, since it contributed to the completion of // the job. Also, commit the exit code/signal stuff, plus any // core filenames. 
if (Proc->status == COMPLETED) { int exit_code, exit_signal, exit_by_signal; int pending; // update the time. CommittedTime = 0; GetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_COMMITTED_TIME, &CommittedTime); CommittedTime += Proc->completion_date - LastRestartTime; SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_COMMITTED_TIME, CommittedTime); // if there is a core file, update that too. if (JobAd->LookupString(ATTR_JOB_CORE_FILENAME, buf, sizeof(buf))) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_JOB_CORE_FILENAME, buf); } // only new style ads have ATTR_ON_EXIT_BY_SIGNAL, so only // SetAttribute for those types of ads if (JobAd->LookupInteger(ATTR_ON_EXIT_BY_SIGNAL, exit_by_signal)==1) { SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_ON_EXIT_BY_SIGNAL, exit_by_signal); if (exit_by_signal == 1) /* exited via signal */ { JobAd->LookupInteger(ATTR_ON_EXIT_SIGNAL, exit_signal); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_ON_EXIT_SIGNAL, exit_signal); } else { JobAd->LookupInteger(ATTR_ON_EXIT_CODE, exit_code); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_ON_EXIT_CODE, exit_code); } } // and now, let's try and mark this job as a terminate pending // job. If the job already is, then fine. We'll mark it again. if (JobAd->LookupBool(ATTR_TERMINATION_PENDING, pending)) { SetAttribute(Proc->id.cluster, Proc->id.proc, ATTR_TERMINATION_PENDING, pending?"TRUE":"FALSE"); } else { // if it isn't in the job ad, then add it to the saved ad in the // schedd. SetAttribute(Proc->id.cluster, Proc->id.proc, ATTR_TERMINATION_PENDING, "TRUE"); } // store the reason why the job is marked completed. if (JobAd->LookupString(ATTR_TERMINATION_REASON, buf, sizeof(buf))) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_TERMINATION_REASON, buf); } // Set up the exit code the shadow was about to exit with to // help support the terminate pending "state". 
SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_TERMINATION_EXITREASON, ExitReason); // Put the job status as created by waitpid() into the job ad // itself. This is to implement the terminate_pending feature. It // is done like this because EVERYWHERE in this codebase we do // stuff like WIFEXITED(JobStatus) and it turns out there are no // user level macros to will one of those status values as returned // by waitpid() into existance. So, we'll put it directly into the // job ad to prevent me having to reimplement a few large functions // which deal with JobStatus directly--as it is sadly a global // variable. SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_WAITPID_STATUS, JobStatus); } } if (!DisconnectQ(0)) { EXCEPT("Failed to commit updated job queue status!"); } }
/* Mess up the in memory job ad with interesting statistics about suspensions */ void record_suspension_hack(unsigned int action) { char tmp[256]; int total_suspensions; int last_suspension_time; int cumulative_suspension_time; extern char *schedd; if (!JobAd) { EXCEPT("Suspension code: Non-existant JobAd"); } switch(action) { case ULOG_JOB_SUSPENDED: /* Add to ad number of suspensions */ JobAd->LookupInteger(ATTR_TOTAL_SUSPENSIONS, total_suspensions); total_suspensions++; sprintf(tmp, "%s = %d", ATTR_TOTAL_SUSPENSIONS, total_suspensions); JobAd->Insert(tmp); /* Add to ad the current suspension time */ last_suspension_time = time(NULL); sprintf(tmp, "%s = %d", ATTR_LAST_SUSPENSION_TIME, last_suspension_time); JobAd->Insert(tmp); break; case ULOG_JOB_UNSUSPENDED: { /* add in the time I spent suspended to a running total */ JobAd->LookupInteger(ATTR_CUMULATIVE_SUSPENSION_TIME, cumulative_suspension_time); JobAd->LookupInteger(ATTR_LAST_SUSPENSION_TIME, last_suspension_time); int delta = time(NULL) - last_suspension_time; cumulative_suspension_time += delta; sprintf(tmp, "%s = %d", ATTR_CUMULATIVE_SUSPENSION_TIME, cumulative_suspension_time); JobAd->Insert(tmp); int uncommitted_suspension_time = 0; JobAd->LookupInteger(ATTR_UNCOMMITTED_SUSPENSION_TIME, uncommitted_suspension_time); uncommitted_suspension_time += delta; JobAd->Assign(ATTR_UNCOMMITTED_SUSPENSION_TIME,uncommitted_suspension_time); /* set the current suspension time to zero, meaning not suspended */ last_suspension_time = 0; sprintf(tmp, "%s = %d", ATTR_LAST_SUSPENSION_TIME, last_suspension_time); JobAd->Insert(tmp); break; } default: EXCEPT("record_suspension_hack(): Action event not recognized."); break; } /* Sanity output */ JobAd->LookupInteger(ATTR_TOTAL_SUSPENSIONS, total_suspensions); dprintf(D_FULLDEBUG,"%s = %d\n", ATTR_TOTAL_SUSPENSIONS, total_suspensions); JobAd->LookupInteger(ATTR_LAST_SUSPENSION_TIME, last_suspension_time); dprintf(D_FULLDEBUG, "%s = %d\n", ATTR_LAST_SUSPENSION_TIME, 
last_suspension_time); JobAd->LookupInteger(ATTR_CUMULATIVE_SUSPENSION_TIME, cumulative_suspension_time); dprintf(D_FULLDEBUG, "%s = %d\n", ATTR_CUMULATIVE_SUSPENSION_TIME, cumulative_suspension_time); /* If we've been asked to perform real time updates of the suspension information, then connect to the queue and do it here. */ if (param_boolean("REAL_TIME_JOB_SUSPEND_UPDATES", false)) { dprintf( D_ALWAYS, "Updating suspension info to schedd.\n" ); if (!ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT)) { /* Since these attributes aren't updated periodically, if the schedd is busy and a resume event update is lost, the the job will be marked suspended when it really isn't. The new shadow eventually corrects this via a periodic update of various calssad attributes, but I suspect it won't be corrected in the event of a bad connect here for this shadow. */ dprintf( D_ALWAYS, "Timeout connecting to schedd. Suspension update lost.\n"); return; } SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_TOTAL_SUSPENSIONS, total_suspensions); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_CUMULATIVE_SUSPENSION_TIME, cumulative_suspension_time); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_LAST_SUSPENSION_TIME, last_suspension_time); DisconnectQ(NULL); } }
int main(int argc, char *argv[]) { has_proc = false; MyString constraint; Qmgr_connection *q; int nextarg = 1, cluster=0, proc=0; bool UseConstraint = false; MyString schedd_name; MyString pool_name; ExprTree* value_expr; myDistro->Init( argc, argv ); config(); #if !defined(WIN32) install_sig_handler(SIGPIPE, SIG_IGN ); #endif if (argc < 2) { usage(argv[0]); } // if -debug is present, it must be first. sigh. if (argv[nextarg][0] == '-' && argv[nextarg][1] == 'd') { // output dprintf messages to stderror at TOOL_DEBUG level dprintf_set_tool_debug("TOOL", 0); nextarg++; } // if it is present, it must be first after debug. if (argv[nextarg][0] == '-' && argv[nextarg][1] == 'n') { nextarg++; // use the given name as the schedd name to connect to if (argc <= nextarg) { fprintf(stderr, "%s: -n requires another argument\n", argv[0]); exit(1); } schedd_name = argv[nextarg]; nextarg++; } if (argc <= nextarg) { usage(argv[0]); } // if it is present, it must be just after -n flag if (argv[nextarg][0] == '-' && argv[nextarg][1] == 'p') { nextarg++; if (argc <= nextarg) { fprintf(stderr, "%s: -pool requires another argument\n", argv[0]); exit(1); } pool_name = argv[nextarg]; nextarg++; } DCSchedd schedd((schedd_name.Length() == 0) ? NULL : schedd_name.Value(), (pool_name.Length() == 0) ? 
NULL : pool_name.Value()); if ( schedd.locate() == false ) { if (schedd_name == "") { fprintf( stderr, "%s: ERROR: Can't find address of local schedd\n", argv[0] ); exit(1); } if (pool_name == "") { fprintf( stderr, "%s: No such schedd named %s in local pool\n", argv[0], schedd_name.Value() ); } else { fprintf( stderr, "%s: No such schedd named %s in " "pool %s\n", argv[0], schedd_name.Value(), pool_name.Value() ); } exit(1); } // Open job queue q = ConnectQ( schedd.addr(), 0, false, NULL, NULL, schedd.version() ); if( !q ) { fprintf( stderr, "Failed to connect to queue manager %s\n", schedd.addr() ); exit(1); } if (argc <= nextarg) { usage(argv[0]); } if (isdigit(argv[nextarg][0])) { char *tmp; cluster = strtol(argv[nextarg], &tmp, 10); if (cluster <= 0) { fprintf( stderr, "Invalid cluster # from %s.\n", argv[nextarg]); exit(1); } if (*tmp == '.') { proc = strtol(tmp + 1, &tmp, 10); if (cluster <= 0) { fprintf( stderr, "Invalid proc # from %s.\n", argv[nextarg]); exit(1); } UseConstraint = false; has_proc = true; } else { constraint.formatstr("(%s == %d)", ATTR_CLUSTER_ID, cluster); UseConstraint = true; } nextarg++; } else if (!match_prefix(argv[nextarg], "-constraint")) { constraint.formatstr("(%s == \"%s\")", ATTR_OWNER, argv[nextarg]); nextarg++; UseConstraint = true; } if (argc <= nextarg) { usage(argv[0]); } while (match_prefix(argv[nextarg], "-constraint")) { if ( has_proc ){ fprintf(stderr, "condor_qedit: proc_id specified. 
Ignoring constraint option\n"); nextarg+=2; continue; } nextarg++; if (argc <= nextarg) { usage(argv[0]); } if ( !UseConstraint ){ constraint = argv[nextarg]; } else{ constraint = "( " + constraint + " ) && " + argv[nextarg]; } nextarg++; UseConstraint = true; } if (argc <= nextarg) { usage(argv[0]); } for (; nextarg < argc; nextarg += 2) { if (argc <= nextarg+1) { usage(argv[0]); } if (ProtectedAttribute(argv[nextarg])) { fprintf(stderr, "Update of attribute \"%s\" is not allowed.\n", argv[nextarg]); fprintf(stderr, "Transaction failed. No attributes were set.\n"); exit(1); } // Check validity of attribute-name if ( blankline(argv[nextarg]) || !IsValidAttrName(argv[nextarg]) ) { fprintf(stderr, "Update aborted, illegal attribute-name specified for attribute \"%s\".\n", argv[nextarg]); fprintf(stderr, "Transaction failed. No attributes were set.\n"); exit(1); } // Check validity of attribute-value value_expr = NULL; if ( blankline(argv[nextarg+1]) || !IsValidAttrValue(argv[nextarg+1]) || ParseClassAdRvalExpr(argv[nextarg+1], value_expr) ) { fprintf(stderr, "Update aborted, illegal attribute-value specified for attribute \"%s\".\n", argv[nextarg]); fprintf(stderr, "Transaction failed. No attributes were set.\n"); exit(1); } if (value_expr) delete value_expr; if (UseConstraint) { // Try to communicate with the newer protocol first if (SetAttributeByConstraint(constraint.Value(), argv[nextarg], argv[nextarg+1], SETDIRTY) < 0) { if (SetAttributeByConstraint(constraint.Value(), argv[nextarg], argv[nextarg+1]) < 0) { fprintf(stderr, "Failed to set attribute \"%s\" by constraint: %s\n", argv[nextarg], constraint.Value()); fprintf(stderr, "Transaction failed. No attributes were set.\n"); exit(1); } } } else { if (SetAttribute(cluster, proc, argv[nextarg], argv[nextarg+1], SETDIRTY) < 0) { fprintf(stderr, "Failed to set attribute \"%s\" for job %d.%d.\n", argv[nextarg], cluster, proc); fprintf(stderr, "Transaction failed. 
No attributes were set.\n"); exit(1); } } printf("Set attribute \"%s\".\n", argv[nextarg]); } if (!DisconnectQ(q)) { fprintf(stderr, "Queue transaction failed. No attributes were set.\n"); exit(1); } return 0; }
// Make sure a queue-management connection to the given schedd is held in
// qmgr.  An existing connection is reused; otherwise ConnectQ() is called
// with errstack as its error sink.  Returns true when qmgr is non-NULL
// afterwards.
bool ActualScheddQ::Connect(DCSchedd & MySchedd, CondorError & errstack) {
	if (!qmgr) {
		qmgr = ConnectQ(MySchedd.addr(),
						0,      // timeout: default
						false,  // read-only: default
						&errstack,
						NULL,
						MySchedd.version());
	}
	return qmgr != NULL;
}
void doContactSchedd() { if (command_queue.IsEmpty()) { daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); // Come back in a min return; } dprintf(D_FULLDEBUG,"in doContactSchedd\n"); SchedDRequest * current_command = NULL; int error=FALSE; std::string error_msg; CondorError errstack; bool do_reschedule = false; int failure_line_num = 0; int failure_errno = 0; // Try connecting to schedd DCSchedd dc_schedd ( ScheddAddr, ScheddPool ); if (dc_schedd.error() || !dc_schedd.locate()) { sprintf( error_msg, "Error locating schedd %s", ScheddAddr ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); // If you can't connect return "Failure" on every job request command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command == SchedDRequest::SDC_STATUS_CONSTRAINED) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0"}; enqueue_result (current_command->request_id, result, 3); } else if (current_command->command == SchedDRequest::SDC_SUBMIT_JOB) { const char * result[] = { GAHP_RESULT_FAILURE, NULL, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); } else if (current_command->command == SchedDRequest::SDC_UPDATE_LEASE) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; enqueue_result (current_command->request_id, result, 3); } else { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } current_command->status = SchedDRequest::SDCS_COMPLETED; } } SchedDRequest::schedd_command_type commands [] = { SchedDRequest::SDC_REMOVE_JOB, SchedDRequest::SDC_HOLD_JOB, SchedDRequest::SDC_RELEASE_JOB }; const char * command_titles [] = { "REMOVE_JOB", "HOLD_JOB", "RELEASE_JOB" }; // REMOVE // HOLD // RELEASE int i=0; while (i<3) { StringList id_list; SimpleList <SchedDRequest*> this_batch; SchedDRequest::schedd_command_type 
this_command = commands[i]; const char * this_action = command_titles[i]; const char * this_reason = NULL; dprintf (D_FULLDEBUG, "Processing %s requests\n", this_action); error = FALSE; // Create a batch of commands with the same command type AND the same reason command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != this_command) continue; if ((this_reason != NULL) && (strcmp (current_command->reason, this_reason) != 0)) continue; if (this_reason == NULL) this_reason = current_command->reason; char job_id_buff[30]; sprintf (job_id_buff, "%d.%d", current_command->cluster_id, current_command->proc_id); id_list.append (job_id_buff); this_batch.Append (current_command); } // If we haven't found any.... if (id_list.isEmpty()) { i++; continue; // ... then try the next command } // Perform the appropriate command on the current batch ClassAd * result_ad= NULL; if (this_command == SchedDRequest::SDC_REMOVE_JOB) { errstack.clear(); result_ad= dc_schedd.removeJobs ( &id_list, this_reason, &errstack); } else if (this_command == SchedDRequest::SDC_HOLD_JOB) { errstack.clear(); result_ad= dc_schedd.holdJobs ( &id_list, this_reason, NULL, &errstack); } else if (this_command == SchedDRequest::SDC_RELEASE_JOB) { errstack.clear(); result_ad= dc_schedd.releaseJobs ( &id_list, this_reason, &errstack); } else { EXCEPT( "Unexpected command type %d in doContactSchedd", this_command ); } // Analyze the result ad if (!result_ad) { error = TRUE; sprintf( error_msg, "Error connecting to schedd %s %s: %s", ScheddAddr, dc_schedd.addr(), errstack.getFullText() ); } else { result_ad->dPrint (D_FULLDEBUG); if ( this_command == SchedDRequest::SDC_RELEASE_JOB ) { do_reschedule = true; } } // Go through the batch again, and create responses for each request this_batch.Rewind(); while (this_batch.Next(current_command)) { // Check the result char job_id_buff[30]; if (result_ad && (error == 
FALSE)) { sprintf (job_id_buff, "job_%d_%d", current_command->cluster_id, current_command->proc_id); int remove_result; if (result_ad->LookupInteger (job_id_buff, remove_result)) { switch (remove_result) { case AR_ERROR: error = TRUE; error_msg = "General Error"; break; case AR_SUCCESS: error = FALSE; break; case AR_NOT_FOUND: error = TRUE; error_msg = "Job not found"; break; case AR_BAD_STATUS: error = TRUE; error_msg = "Bad job status"; break; case AR_ALREADY_DONE: error = TRUE; error_msg = "Already done"; break; case AR_PERMISSION_DENIED: error = TRUE; error_msg = "Permission denied"; break; default: error = TRUE; error_msg = "Unknown Result"; } // hctiws } else { error_msg = "Unable to get result"; } // fi lookup result for job } // fi error == FALSE if (error) { dprintf (D_ALWAYS, "Error (operation: %s) %d.%d: %s\n", this_action, current_command->cluster_id, current_command->proc_id, error_msg.c_str()); const char * result[2]; result[0] = GAHP_RESULT_FAILURE; result[1] = error_msg.c_str(); enqueue_result (current_command->request_id, result, 2); } else { dprintf (D_ALWAYS, "Succeess (operation: %s) %d.%d\n", this_action, current_command->cluster_id, current_command->proc_id); const char * result[2]; result[0] = GAHP_RESULT_SUCCESS; result[1] = NULL; enqueue_result (current_command->request_id, result, 2); } // fi error // Mark the status current_command->status = SchedDRequest::SDCS_COMPLETED; } // elihw this_batch if ( result_ad ) { delete result_ad; } } dprintf (D_FULLDEBUG, "Processing JOB_STAGE_IN requests\n"); // JOB_STAGE_IN int MAX_BATCH_SIZE=1; // This should be a config param SimpleList <SchedDRequest*> stage_in_batch; do { stage_in_batch.Clear(); command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_STAGE_IN) continue; dprintf (D_ALWAYS, "Adding %d.%d to STAGE_IN batch\n", current_command->cluster_id, 
current_command->proc_id); stage_in_batch.Append (current_command); if (stage_in_batch.Number() >= MAX_BATCH_SIZE) break; } if (stage_in_batch.Number() > 0) { ClassAd ** array = new ClassAd*[stage_in_batch.Number()]; i=0; stage_in_batch.Rewind(); while (stage_in_batch.Next(current_command)) { array[i++] = current_command->classad; } error = FALSE; errstack.clear(); if (!dc_schedd.spoolJobFiles( stage_in_batch.Number(), array, &errstack )) { error = TRUE; sprintf( error_msg, "Error sending files to schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } delete [] array; stage_in_batch.Rewind(); while (stage_in_batch.Next(current_command)) { current_command->status = SchedDRequest::SDCS_COMPLETED; if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } else { const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); } } // elihw (command_queue) } // fi has STAGE_IN requests } while (stage_in_batch.Number() > 0); dprintf (D_FULLDEBUG, "Processing JOB_STAGE_OUT requests\n"); // JOB_STAGE_OUT SimpleList <SchedDRequest*> stage_out_batch; command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_STAGE_OUT) continue; stage_out_batch.Append (current_command); } if (stage_out_batch.Number() > 0) { std::string constraint = ""; stage_out_batch.Rewind(); int jobsexpected = stage_out_batch.Number(); while (stage_out_batch.Next(current_command)) { sprintf_cat( constraint, "(ClusterId==%d&&ProcId==%d)||", current_command->cluster_id, current_command->proc_id ); } constraint += "False"; error = FALSE; errstack.clear(); int jobssent; if (!dc_schedd.receiveJobSandbox( constraint.c_str(), &errstack, &jobssent )) { error = TRUE; sprintf( error_msg, "Error receiving files 
from schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } if(error == FALSE && jobssent != jobsexpected) { error = TRUE; sprintf( error_msg, "Schedd %s didn't send expected files", ScheddAddr ); dprintf (D_ALWAYS, "Transfered files for %d jobs but got files for %d jobs. (Schedd %s with contraint %s\n", jobsexpected, jobssent, ScheddAddr, constraint.c_str()); } stage_out_batch.Rewind(); while (stage_out_batch.Next(current_command)) { current_command->status = SchedDRequest::SDCS_COMPLETED; if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } else { const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); } } // elihw (command_queue) } // fi has STAGE_OUT requests dprintf (D_FULLDEBUG, "Processing JOB_REFRESH_PROXY requests\n"); CondorVersionInfo ver_info(dc_schedd.version()); bool delegate_credential; if ( ver_info.built_since_version(6,7,19) && param_boolean( "DELEGATE_JOB_GSI_CREDENTIALS", true ) ) { delegate_credential = true; } else { delegate_credential = false; } // JOB_REFRESH_PROXY command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_REFRESH_PROXY) continue; time_t expiration_time = GetDesiredDelegatedJobCredentialExpiration(current_command->classad); time_t result_expiration_time = 0; bool result; errstack.clear(); if ( delegate_credential ) { result = dc_schedd.delegateGSIcredential( current_command->cluster_id, current_command->proc_id, current_command->proxy_file, expiration_time, &result_expiration_time, &errstack ); // Currently, we do not propagate the actual resulting // expiration time back to the gridmanager. We // probably should. 
} else { result = dc_schedd.updateGSIcredential( current_command->cluster_id, current_command->proc_id, current_command->proxy_file, &errstack ); } current_command->status = SchedDRequest::SDCS_COMPLETED; if (result == false) { sprintf( error_msg, "Error refreshing proxy to schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); const char * result_to_queue[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result_to_queue, 2); } else { const char * result_to_queue[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result_to_queue, 2); } } // Now do all the QMGMT transactions error = FALSE; // Try connecting to the queue Qmgr_connection * qmgr_connection; if ((qmgr_connection = ConnectQ(dc_schedd.addr(), QMGMT_TIMEOUT, false, NULL, NULL, dc_schedd.version() )) == NULL) { error = TRUE; sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } else { errno = 0; AbortTransaction(); // Just so we can call BeginTransaction() in the loop if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } dprintf (D_FULLDEBUG, "Processing UPDATE_CONSTRAINED/UDATE_JOB requests\n"); // UPDATE_CONSTRAINED // UDATE_JOB command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if ((current_command->command != SchedDRequest::SDC_UPDATE_CONSTRAINED) && (current_command->command != SchedDRequest::SDC_UPDATE_JOB)) continue; if (qmgr_connection == NULL) goto update_report_result; error = FALSE; errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } current_command->classad->ResetExpr(); ExprTree *tree; const char *lhstr, *rhstr; while( current_command->classad->NextExpr(lhstr, tree) ) { rhstr = 
ExprTreeToString( tree ); if( !lhstr || !rhstr) { sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s", current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } else { if (current_command->command == SchedDRequest::SDC_UPDATE_CONSTRAINED) { if( SetAttributeByConstraint(current_command->constraint, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed (errno=%d) to SetAttributeByConstraint %s=%s for constraint %s", errno, lhstr, rhstr, current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } else if (current_command->command == SchedDRequest::SDC_UPDATE_JOB) { if( SetAttribute(current_command->cluster_id, current_command->proc_id, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetAttribute() %s=%s for job %d.%d", lhstr, rhstr, current_command->cluster_id, current_command->proc_id); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } } if (error) break; } // elihw classad update_report_result: if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } } else { if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; } // fi } // elihw dprintf 
(D_FULLDEBUG, "Processing UPDATE_LEASE requests\n"); // UPDATE_LEASE command_queue.Rewind(); while (command_queue.Next(current_command)) { error = FALSE; if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_UPDATE_LEASE) continue; std::string success_job_ids=""; if (qmgr_connection == NULL) { sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr ); error = TRUE; } else { error = FALSE; errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } for (i=0; i<current_command->num_jobs; i++) { time_t time_now = time(NULL); int duration = current_command->expirations[i].expiration - time_now; dprintf (D_FULLDEBUG, "Job %d.%d SetTimerAttribute=%d\n", current_command->expirations[i].cluster, current_command->expirations[i].proc, duration); if (SetTimerAttribute (current_command->expirations[i].cluster, current_command->expirations[i].proc, ATTR_TIMER_REMOVE_CHECK, duration) < 0) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } dprintf (D_ALWAYS, "Unable to SetTimerAttribute(%d, %d), errno=%d\n", current_command->expirations[i].cluster, current_command->expirations[i].proc, errno); } else { // Append job id to the result line if (success_job_ids.length() > 0) success_job_ids += ","; sprintf_cat( success_job_ids, "%d.%d", current_command->expirations[i].cluster, current_command->expirations[i].proc); } } //rof jobs for request } // fi error if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } } else { 
if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, NULL, success_job_ids.length()?success_job_ids.c_str():NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } // fi } // elihw UPDATE_LEASE requests dprintf (D_FULLDEBUG, "Processing SUBMIT_JOB requests\n"); // SUBMIT_JOB command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_SUBMIT_JOB) continue; int ClusterId = -1; int ProcId = -1; if (qmgr_connection == NULL) { error = TRUE; goto submit_report_result; } errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } error = FALSE; if ((ClusterId = NewCluster()) >= 0) { ProcId = NewProc (ClusterId); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } if ( ClusterId < 0 ) { error = TRUE; error_msg = "Unable to create a new job cluster"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } else if ( ProcId < 0 ) { error = TRUE; error_msg = "Unable to create a new job proc"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } if ( ClusterId == -2 || ProcId == -2 ) { error = TRUE; error_msg = "Number of submitted jobs would exceed MAX_JOBS_SUBMITTED\n"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } // Adjust the argument/environment syntax based on the version // of the schedd we are talking to. if( error == FALSE) { CondorVersionInfo version_info(dc_schedd.version()); ArgList arglist; MyString arg_error_msg; Env env_obj; MyString env_error_msg; if(!arglist.AppendArgsFromClassAd(current_command->classad,&arg_error_msg) || ! 
arglist.InsertArgsIntoClassAd(current_command->classad,&version_info,&arg_error_msg)) { sprintf( error_msg, "ERROR: ClassAd problem in converting arguments to syntax " "for schedd (version=%s): %s\n", dc_schedd.version() ? dc_schedd.version() : "NULL", arg_error_msg.Value()); dprintf( D_ALWAYS,"%s\n", error_msg.c_str() ); error = TRUE; } if(!env_obj.MergeFrom(current_command->classad,&env_error_msg) || !env_obj.InsertEnvIntoClassAd(current_command->classad,&env_error_msg,NULL,&version_info)) { sprintf( error_msg, "ERROR: Failed to convert environment to target syntax" " for schedd (version %s): %s\n", dc_schedd.version() ? dc_schedd.version() : "NULL", env_error_msg.Value()); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } if( error == FALSE ) { // See the comment in the function body of ExpandInputFileList // for an explanation of what is going on here. MyString transfer_input_error_msg; if( !FileTransfer::ExpandInputFileList( current_command->classad, transfer_input_error_msg ) ) { dprintf( D_ALWAYS, "%s\n", transfer_input_error_msg.Value() ); error = TRUE; } } if ( error == FALSE ) { current_command->classad->Assign(ATTR_CLUSTER_ID, ClusterId); current_command->classad->Assign(ATTR_PROC_ID, ProcId); // Special case for the job lease int expire_time; if ( current_command->classad->LookupInteger( ATTR_TIMER_REMOVE_CHECK, expire_time ) ) { if ( SetTimerAttribute( ClusterId, ProcId, ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL) ) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetTimerAttribute %s=%ld for job %d.%d", ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL), ClusterId, ProcId ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; goto submit_report_result; } current_command->classad->Delete( ATTR_TIMER_REMOVE_CHECK ); } // Set all the classad attribute on the remote classad current_command->classad->ResetExpr(); 
ExprTree *tree; const char *lhstr, *rhstr; while( current_command->classad->NextExpr(lhstr, tree) ) { rhstr = ExprTreeToString( tree ); if( !lhstr || !rhstr) { sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s", current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } else if( SetAttribute (ClusterId, ProcId, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetAttribute %s=%s for job %d.%d", lhstr, rhstr, ClusterId, ProcId ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } if (error) break; } // elihw classad } // fi error==FALSE submit_report_result: char job_id_buff[30]; sprintf (job_id_buff, "%d.%d", ClusterId, ProcId); if (error) { const char * result[] = { GAHP_RESULT_FAILURE, job_id_buff, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } current_command->status = SchedDRequest::SDCS_COMPLETED; } else { if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, job_id_buff, NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } } // elihw dprintf (D_FULLDEBUG, "Processing STATUS_CONSTRAINED requests\n"); // STATUS_CONSTRAINED command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_STATUS_CONSTRAINED) continue; if (qmgr_connection != NULL) { SimpleList <MyString *> matching_ads; error = FALSE; ClassAd *next_ad; ClassAdList adlist; // Only use 
GetAllJobsByConstraint if remote schedd is // 6.9.5 or newer. Previous versions either did not // support this call, or they closed the Qmgmt connection // as a side-effect of this call. if( ver_info.built_since_version(6,9,5) ) { dprintf( D_FULLDEBUG, "Calling GetAllJobsByConstraint(%s)\n", current_command->constraint ); // NOTE: this could be made more efficient if we knew // the list of attributes to query. For lack of that, // we just get all attributes. GetAllJobsByConstraint( current_command->constraint, "", adlist); } else { // This is the old latency-prone method. dprintf( D_FULLDEBUG, "Calling GetNextJobByConstraint(%s)\n", current_command->constraint ); next_ad = GetNextJobByConstraint( current_command->constraint, 1 ); while( next_ad != NULL ) { adlist.Insert( next_ad ); next_ad = GetNextJobByConstraint( current_command->constraint, 0 ); } } // NOTE: ClassAdList will deallocate the ClassAds in it adlist.Rewind(); while( (next_ad=adlist.Next()) ) { MyString * da_buffer = new MyString(); // Use a ptr to avoid excessive copying if ( useXMLClassads ) { ClassAdXMLUnparser unparser; unparser.SetUseCompactSpacing(true); unparser.Unparse (next_ad, *da_buffer); } else { NewClassAdUnparser unparser; unparser.SetUseCompactSpacing(true); unparser.Unparse (next_ad, *da_buffer); } matching_ads.Append (da_buffer); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } // now output this list of classads into a result const char ** result = new const char* [matching_ads.Length() + 3]; std::string _ad_count; sprintf( _ad_count, "%d", matching_ads.Length() ); int count=0; result[count++] = GAHP_RESULT_SUCCESS; result[count++] = NULL; result[count++] = _ad_count.c_str(); MyString *next_string; matching_ads.Rewind(); while (matching_ads.Next(next_string)) { result[count++] = next_string->Value(); } enqueue_result (current_command->request_id, result, count); current_command->status = SchedDRequest::SDCS_COMPLETED; 
// Cleanup matching_ads.Rewind(); while (matching_ads.Next(next_string)) { delete next_string; } //CommitTransaction(); delete [] result; } else { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } } //elihw contact_schedd_disconnect: if ( qmgr_connection != NULL ) { DisconnectQ (qmgr_connection, FALSE); } if ( failure_line_num ) { // We had an error talking to the schedd. Take all of our // incomplete commands and mark them as failed. // TODO Consider retrying these commands, rather than // immediately marking them as failed. if ( failure_errno == ETIMEDOUT ) { dprintf( D_ALWAYS, "Timed out talking to schedd at line %d in " "doContactSchedd()\n", failure_line_num ); sprintf( error_msg, "Timed out talking to schedd" ); } else { dprintf( D_ALWAYS, "Error talking to schedd at line %d in " "doContactSchedd(), errno=%d (%s)\n", failure_line_num, failure_errno, strerror(failure_errno) ); sprintf( error_msg, "Error talking to schedd" ); } command_queue.Rewind(); while (command_queue.Next(current_command)) { if ( current_command->status != SchedDRequest::SDCS_NEW ) { continue; } switch( current_command->command ) { case SchedDRequest::SDC_UPDATE_JOB: case SchedDRequest::SDC_UPDATE_CONSTRAINED: { const char *result[2] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_UPDATE_LEASE: { const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_SUBMIT_JOB: { const char *result[3] = { GAHP_RESULT_FAILURE, "-1.-1", error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); current_command->status 
= SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_STATUS_CONSTRAINED: { const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; default: // Do nothing ; } } } if ( do_reschedule ) { dc_schedd.reschedule(); } // Write all of our results to our parent. flush_results(); dprintf (D_FULLDEBUG, "Finishing doContactSchedd()\n"); // Clean up the list command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status == SchedDRequest::SDCS_COMPLETED) { command_queue.DeleteCurrent(); delete current_command; } } // Come back soon.. // QUESTION: Should this always be a fixed time period? daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); }