bool BaseJob::SetRemoteJobStatus( const char *job_status )
{
    std::string old_job_status;
    std::string new_job_status;

    if ( job_status ) {
        m_lastRemoteStatusUpdate = time(NULL);
        jobAd->Assign( ATTR_LAST_REMOTE_STATUS_UPDATE, m_lastRemoteStatusUpdate );
        requestScheddUpdate( this, false );
        if ( m_currentStatusUnknown == true ) {
            m_currentStatusUnknown = false;
            jobAd->Assign( ATTR_CURRENT_STATUS_UNKNOWN, false );
            WriteJobStatusKnownEventToUserLog( jobAd );
        }
    }

    jobAd->LookupString( ATTR_GRID_JOB_STATUS, old_job_status );
    if ( job_status != NULL && job_status[0] != '\0' ) {
        new_job_status = job_status;
    }
    if ( old_job_status == new_job_status ) {
        return false;
    }
    if ( !old_job_status.empty() ) {
        jobAd->AssignExpr( ATTR_GRID_JOB_STATUS, "Undefined" );
    }
    if ( !new_job_status.empty() ) {
        jobAd->Assign( ATTR_GRID_JOB_STATUS, new_job_status.c_str() );
    }

    requestScheddUpdate( this, false );
    return true;
}
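// Record the job's remote id in both the job ad (ATTR_GRID_JOB_ID) and
// the JobsByRemoteId hash table, keeping the two in sync. Clearing the
// id (passing NULL) also resets the remote-status staleness tracking,
// since there is no longer a remote job to poll.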
void BaseJob::SetRemoteJobId( const char *job_id )
{
    std::string old_job_id;
    std::string new_job_id;

    jobAd->LookupString( ATTR_GRID_JOB_ID, old_job_id );
    if ( job_id != NULL && job_id[0] != '\0' ) {
        new_job_id = job_id;
    }
    if ( old_job_id == new_job_id ) {
        return;
    }

    if ( !old_job_id.empty() ) {
        JobsByRemoteId.remove( HashKey( old_job_id.c_str() ) );
        jobAd->AssignExpr( ATTR_GRID_JOB_ID, "Undefined" );
    } else {
        // old job id was NULL
        m_lastRemoteStatusUpdate = time(NULL);
        jobAd->Assign( ATTR_LAST_REMOTE_STATUS_UPDATE, m_lastRemoteStatusUpdate );
    }

    if ( !new_job_id.empty() ) {
        JobsByRemoteId.insert( HashKey( new_job_id.c_str() ), this );
        jobAd->Assign( ATTR_GRID_JOB_ID, new_job_id.c_str() );
    } else {
        // new job id is NULL
        m_lastRemoteStatusUpdate = 0;
        jobAd->Assign( ATTR_LAST_REMOTE_STATUS_UPDATE, 0 );
        m_currentStatusUnknown = false;
        jobAd->Assign( ATTR_CURRENT_STATUS_UNKNOWN, false );
    }

    requestScheddUpdate( this, false );
}
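// Called when the gridmanager is done managing this job. COMPLETED and
// REMOVED jobs are marked MANAGED_DONE so they can be deleted from the
// schedd; HELD and IDLE jobs are handed back to the schedd by setting
// ATTR_JOB_MANAGED to MANAGED_SCHEDD.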
void BaseJob::DoneWithJob()
{
    deleteFromGridmanager = true;

    switch ( condorState ) {
    case COMPLETED: {
        // I never want to see this job again.
        jobAd->Assign( ATTR_JOB_MANAGED, MANAGED_DONE );
        if ( writeUserLog && !terminateLogged ) {
            WriteTerminateEventToUserLog( jobAd );
            EmailTerminateEvent( jobAd, exitStatusKnown );
            terminateLogged = true;
        }
        deleteFromSchedd = true;
        } break;
    case REMOVED: {
        // I never want to see this job again.
        jobAd->Assign( ATTR_JOB_MANAGED, MANAGED_DONE );
        if ( writeUserLog && !abortLogged ) {
            WriteAbortEventToUserLog( jobAd );
            abortLogged = true;
        }
        deleteFromSchedd = true;
        } break;
    case HELD: {
        jobAd->Assign( ATTR_JOB_MANAGED, MANAGED_SCHEDD );
        if ( writeUserLog && !holdLogged ) {
            WriteHoldEventToUserLog( jobAd );
            holdLogged = true;
        }
        } break;
    case IDLE: {
        jobAd->Assign( ATTR_JOB_MANAGED, MANAGED_SCHEDD );
        } break;
    default: {
        EXCEPT( "BaseJob::DoneWithJob called with unexpected state %s (%d)",
                getJobStatusString(condorState), condorState );
        } break;
    }

    requestScheddUpdate( this, false );
}
void BaseJob::JobRemoved( const char *remove_reason )
{
    if ( condorState != REMOVED ) {
        condorState = REMOVED;
        jobAd->Assign( ATTR_JOB_STATUS, condorState );
        jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );
        jobAd->Assign( ATTR_REMOVE_REASON, remove_reason );
        UpdateRuntimeStats();
        requestScheddUpdate( this, false );
    }
}
void BaseJob::JobCompleted()
{
    if ( condorState != COMPLETED && condorState != HELD &&
         condorState != REMOVED ) {
        condorState = COMPLETED;
        jobAd->Assign( ATTR_JOB_STATUS, condorState );
        jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );
        UpdateRuntimeStats();
        requestScheddUpdate( this, false );
    }
}
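// Maintain the wall-clock accounting attributes used by condor_q.
// ATTR_SHADOW_BIRTHDATE marks the start of the current running
// interval; when the job stops running, that interval is folded into
// ATTR_JOB_REMOTE_WALL_CLOCK and the birthdate is cleared.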
void BaseJob::UpdateRuntimeStats()
{
    if ( calcRuntimeStats == false ) {
        return;
    }

    // Adjust run time for condor_q
    int shadowBirthdate = 0;
    jobAd->LookupInteger( ATTR_SHADOW_BIRTHDATE, shadowBirthdate );
    if ( condorState == RUNNING && shadowBirthdate == 0 ) {

        // The job has started a new interval of running
        int current_time = (int)time(NULL);
        jobAd->Assign( ATTR_SHADOW_BIRTHDATE, current_time );

        int num_job_starts = 0;
        jobAd->LookupInteger( ATTR_NUM_JOB_STARTS, num_job_starts );
        num_job_starts++;
        jobAd->Assign( ATTR_NUM_JOB_STARTS, num_job_starts );

        requestScheddUpdate( this, false );

    } else if ( condorState != RUNNING && shadowBirthdate != 0 ) {

        // The job has stopped an interval of running, add the current
        // interval to the accumulated total run time
        float accum_time = 0;
        jobAd->LookupFloat( ATTR_JOB_REMOTE_WALL_CLOCK, accum_time );
        accum_time += (float)( time(NULL) - shadowBirthdate );
        jobAd->Assign( ATTR_JOB_REMOTE_WALL_CLOCK, accum_time );
        jobAd->Assign( ATTR_JOB_WALL_CLOCK_CKPT, (char *)NULL );
        jobAd->AssignExpr( ATTR_SHADOW_BIRTHDATE, "Undefined" );

        requestScheddUpdate( this, false );
    }
}
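// Put the job on hold, recording the hold reason and codes and writing
// a hold event to the user log. Jobs marked nonessential are removed
// instead of held. A job held while in REMOVED state records that in
// ATTR_JOB_STATUS_ON_RELEASE so a later release returns it there.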
void BaseJob::JobHeld( const char *hold_reason, int hold_code,
                       int hold_sub_code )
{
    bool nonessential = false;
    jobAd->LookupBool( ATTR_JOB_NONESSENTIAL, nonessential );
    if ( nonessential ) {
        // Don't have the gridmanager put a job on hold if
        // the job is nonessential. Instead, just remove it.
        JobRemoved( hold_reason );
        return;
    }

    if ( condorState != HELD ) {
        // If the job was in REMOVED state, make certain we return
        // to the removed state when it is released.
        if ( condorState == REMOVED ) {
            jobAd->Assign( ATTR_JOB_STATUS_ON_RELEASE, REMOVED );
        }
        condorState = HELD;
        jobAd->Assign( ATTR_JOB_STATUS, condorState );
        jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );
        jobAd->Assign( ATTR_HOLD_REASON, hold_reason );
        jobAd->Assign( ATTR_HOLD_REASON_CODE, hold_code );
        jobAd->Assign( ATTR_HOLD_REASON_SUBCODE, hold_sub_code );

        char *release_reason;
        if ( jobAd->LookupString( ATTR_RELEASE_REASON,
                                  &release_reason ) != 0 ) {
            jobAd->Assign( ATTR_LAST_RELEASE_REASON, release_reason );
            free( release_reason );
        }
        jobAd->AssignExpr( ATTR_RELEASE_REASON, "Undefined" );

        int num_holds = 0;
        jobAd->LookupInteger( ATTR_NUM_SYSTEM_HOLDS, num_holds );
        num_holds++;
        jobAd->Assign( ATTR_NUM_SYSTEM_HOLDS, num_holds );

        UpdateRuntimeStats();

        if ( writeUserLog && !holdLogged ) {
            WriteHoldEventToUserLog( jobAd );
            holdLogged = true;
        }

        requestScheddUpdate( this, false );
    }
}
void BaseJob::NotifyResourceUp()
{
    resourceStateKnown = true;
    if ( resourceDown == true ) {
        // The GlobusResourceUp event is now deprecated
        WriteGlobusResourceUpEventToUserLog( jobAd );
        WriteGridResourceUpEventToUserLog( jobAd );
        jobAd->AssignExpr( ATTR_GRID_RESOURCE_UNAVAILABLE_TIME, "Undefined" );
        requestScheddUpdate( this, false );
    }
    resourceDown = false;
    if ( resourcePingPending ) {
        resourcePingPending = false;
        resourcePingComplete = true;
    }
    SetEvaluateState();
}
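// Periodic check for a stale remote job status: if no status update has
// arrived within stale_limit seconds, mark the status unknown in the
// job ad and log a job-status-unknown event to the user log.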
void BaseJob::CheckRemoteStatus()
{
    const int stale_limit = 15*60;

    // TODO return time that this job status could become stale?
    // TODO compute stale_limit from job's poll interval?
    // TODO make stale_limit configurable?
    if ( m_lastRemoteStatusUpdate == 0 || m_currentStatusUnknown == true ) {
        return;
    }

    if ( time(NULL) > m_lastRemoteStatusUpdate + stale_limit ) {
        m_currentStatusUnknown = true;
        jobAd->Assign( ATTR_CURRENT_STATUS_UNKNOWN, true );
        requestScheddUpdate( this, false );
        WriteJobStatusUnknownEventToUserLog( jobAd );
        SetEvaluateState();
    }
}
void BaseJob::JobRunning()
{
    if ( condorState != RUNNING && condorState != HELD &&
         condorState != REMOVED ) {
        condorState = RUNNING;
        jobAd->Assign( ATTR_JOB_STATUS, condorState );
        jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );
        UpdateRuntimeStats();

        if ( writeUserLog && !executeLogged ) {
            WriteExecuteEventToUserLog( jobAd );
            executeLogged = true;
        }

        requestScheddUpdate( this, false );
    }
}
void BaseJob::JobLeaseReceivedExpired()
{
    dprintf(D_FULLDEBUG,"(%d.%d) BaseJob::JobLeaseReceivedExpired()\n",
            procID.cluster,procID.proc);

    if ( jobLeaseReceivedExpiredTid != TIMER_UNSET ) {
        daemonCore->Cancel_Timer( jobLeaseReceivedExpiredTid );
        jobLeaseReceivedExpiredTid = TIMER_UNSET;
    }

    condorState = REMOVED;
    jobAd->Assign( ATTR_JOB_STATUS, condorState );
    jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );
    jobAd->Assign( ATTR_REMOVE_REASON, "Job lease expired" );
    UpdateRuntimeStats();
    requestScheddUpdate( this, false );

    SetEvaluateState();
}
void BaseJob::JobIdle()
{
    if ( condorState != IDLE && condorState != HELD &&
         condorState != REMOVED ) {
        bool write_evict = ( condorState == RUNNING );

        condorState = IDLE;
        jobAd->Assign( ATTR_JOB_STATUS, condorState );
        jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );
        UpdateRuntimeStats();

        if ( write_evict ) {
            WriteEvictEventToUserLog( jobAd );
            executeLogged = false;
        }

        requestScheddUpdate( this, false );
    }
}
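// Record the lease expiration time most recently sent to the remote
// resource. If the lease is new or expires sooner than the previous
// one, ask the resource object to schedule a lease renewal.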
void BaseJob::UpdateJobLeaseSent( int new_expiration_time )
{
    dprintf(D_FULLDEBUG,"(%d.%d) UpdateJobLeaseSent(%d)\n",
            procID.cluster,procID.proc,(int)new_expiration_time);

    int old_expiration_time = TIMER_UNSET;

    jobAd->LookupInteger( ATTR_JOB_LEASE_EXPIRATION, old_expiration_time );

    if ( new_expiration_time <= 0 ) {
        new_expiration_time = TIMER_UNSET;
    }

    if ( new_expiration_time != old_expiration_time ) {

        if ( new_expiration_time == TIMER_UNSET ) {
            jobAd->AssignExpr( ATTR_JOB_LEASE_EXPIRATION, "Undefined" );
        } else {
            jobAd->Assign( ATTR_JOB_LEASE_EXPIRATION, new_expiration_time );
        }

        if ( old_expiration_time == TIMER_UNSET ||
             ( new_expiration_time != TIMER_UNSET &&
               new_expiration_time < old_expiration_time ) ) {
            BaseResource *resource = GetResource();
            if ( resource ) {
                resource->RequestUpdateLeases();
            }
        }

        requestScheddUpdate( this, false );

        SetJobLeaseTimers();
    }
}
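// Timer handler that drives lease renewal for every job registered with
// this resource. It proceeds in phases across timer firings: compute
// new expiration times (updateLeasesActive), wait for the schedd to
// commit the updated job ads (leaseAttrsSynched), issue the renewal
// command through the gahp, then record per-job success or failure.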
void BaseResource::UpdateLeases()
{
    dprintf(D_FULLDEBUG,"*** UpdateLeases called\n");
    if ( hasLeases == false ) {
        dprintf(D_FULLDEBUG," Leases not supported, cancelling timer\n" );
        daemonCore->Cancel_Timer( updateLeasesTimerId );
        updateLeasesTimerId = TIMER_UNSET;
        return;
    }

    // Don't start a new lease update too soon after the previous one.
    int delay;
    delay = (lastUpdateLeases + UPDATE_LEASE_DELAY) - time(NULL);
    if ( delay > 0 ) {
        daemonCore->Reset_Timer( updateLeasesTimerId, delay );
        dprintf(D_FULLDEBUG," UpdateLeases: last update too recent, delaying %d secs\n",delay);
        return;
    }

    daemonCore->Reset_Timer( updateLeasesTimerId, TIMER_NEVER );

    if ( updateLeasesActive == false ) {
        BaseJob *curr_job;
        time_t next_renew_time = INT_MAX;
        time_t job_renew_time;
        int min_new_expire = INT_MAX;
        dprintf(D_FULLDEBUG," UpdateLeases: calc'ing new leases\n");
        registeredJobs.Rewind();
        dprintf(D_FULLDEBUG," starting min_new_expire=%d next_renew_time=%ld\n",min_new_expire,next_renew_time);
        while ( registeredJobs.Next( curr_job ) ) {
            int new_expire;
            std::string job_id;
            job_renew_time = next_renew_time;
            // Don't update the lease for a job that isn't submitted
            // anywhere. The Job object will start the lease when it
            // submits the job.
            if ( ( m_hasSharedLeases ||
                   curr_job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) &&
                 CalculateJobLease( curr_job->jobAd, new_expire,
                                    m_defaultLeaseDuration,
                                    &job_renew_time ) ) {

                if ( new_expire < min_new_expire ) {
                    min_new_expire = new_expire;
                }
                if ( !m_hasSharedLeases ) {
                    curr_job->UpdateJobLeaseSent( new_expire );
                    leaseUpdates.Append( curr_job );
                }
            } else if ( job_renew_time < next_renew_time ) {
                next_renew_time = job_renew_time;
            }
            dprintf(D_FULLDEBUG," after %d.%d: min_new_expire=%d next_renew_time=%ld job_renew_time=%ld\n",curr_job->procID.cluster,curr_job->procID.proc,min_new_expire,next_renew_time,job_renew_time);
        }
        if ( min_new_expire == INT_MAX ||
             ( m_hasSharedLeases && next_renew_time < INT_MAX &&
               m_sharedLeaseExpiration != 0 ) ) {
            if ( next_renew_time > time(NULL) + 3600 ) {
                next_renew_time = time(NULL) + 3600;
            }
            dprintf(D_FULLDEBUG," UpdateLeases: nothing to renew, resetting timer for %ld secs\n",next_renew_time - time(NULL));
            lastUpdateLeases = time(NULL);
            daemonCore->Reset_Timer( updateLeasesTimerId,
                                     next_renew_time - time(NULL) );
        } else {
            if ( m_hasSharedLeases ) {
                registeredJobs.Rewind();
                while ( registeredJobs.Next( curr_job ) ) {
                    std::string job_id;
                    if ( curr_job->jobAd->LookupString( ATTR_GRID_JOB_ID,
                                                        job_id ) ) {
                        curr_job->UpdateJobLeaseSent( min_new_expire );
                    }
                }
                m_sharedLeaseExpiration = min_new_expire;
                dprintf(D_FULLDEBUG," new shared lease expiration at %ld, updating job ads...\n",m_sharedLeaseExpiration);
            }
            requestScheddUpdateNotification( updateLeasesTimerId );
            updateLeasesActive = true;
            leaseAttrsSynched = false;
        }
        return;
    }

    if ( leaseAttrsSynched == false ) {
        bool still_dirty = false;
        BaseJob *curr_job;
        leaseUpdates.Rewind();
        while ( leaseUpdates.Next( curr_job ) ) {
            bool exists, dirty;
            curr_job->jobAd->GetDirtyFlag( ATTR_JOB_LEASE_EXPIRATION,
                                           &exists, &dirty );
            if ( !exists ) {
                // What!? The attribute disappeared? Forget about renewing
                // the lease then
                dprintf( D_ALWAYS,
                         "Lease attribute disappeared for job %d.%d, ignoring it\n",
                         curr_job->procID.cluster, curr_job->procID.proc );
                leaseUpdates.DeleteCurrent();
            }
            if ( dirty ) {
                still_dirty = true;
                requestScheddUpdate( curr_job, false );
            }
        }
        if ( still_dirty ) {
            requestScheddUpdateNotification( updateLeasesTimerId );
            dprintf(D_FULLDEBUG," UpdateLeases: waiting for schedd synch\n");
            return;
        } else {
            dprintf(D_FULLDEBUG," UpdateLeases: leases synched\n");
        }
    }

    leaseAttrsSynched = true;

    unsigned update_delay = 0;
    bool update_complete;
    SimpleList<PROC_ID> update_succeeded;
    bool update_success;
    dprintf(D_FULLDEBUG," UpdateLeases: calling DoUpdateLeases\n");
    if ( m_hasSharedLeases ) {
        DoUpdateSharedLease( update_delay, update_complete, update_success );
    } else {
        DoUpdateLeases( update_delay, update_complete, update_succeeded );
    }

    if ( update_delay ) {
        daemonCore->Reset_Timer( updateLeasesTimerId, update_delay );
        dprintf(D_FULLDEBUG," UpdateLeases: DoUpdateLeases wants delay of %u secs\n",update_delay);
        return;
    }

    if ( !update_complete ) {
        updateLeasesCmdActive = true;
        dprintf(D_FULLDEBUG," UpdateLeases: DoUpdateLeases in progress\n");
        return;
    }

    dprintf(D_FULLDEBUG," UpdateLeases: DoUpdateLeases complete, processing results\n");
    bool first_update = lastUpdateLeases == 0;
    updateLeasesCmdActive = false;
    lastUpdateLeases = time(NULL);

    if ( m_hasSharedLeases ) {
        BaseJob *curr_job;
        std::string tmp;
        registeredJobs.Rewind();
        while ( registeredJobs.Next( curr_job ) ) {
            if ( first_update ) {
                // New jobs may be waiting for the lease to be established
                // before they proceed with submission.
                curr_job->SetEvaluateState();
            }
            if ( !curr_job->jobAd->LookupString( ATTR_GRID_JOB_ID, tmp ) ) {
                continue;
            }
            bool curr_renewal_failed = !update_success;
            bool last_renewal_failed = false;
            curr_job->jobAd->LookupBool( ATTR_LAST_JOB_LEASE_RENEWAL_FAILED,
                                         last_renewal_failed );
            if ( curr_renewal_failed != last_renewal_failed ) {
                curr_job->jobAd->Assign( ATTR_LAST_JOB_LEASE_RENEWAL_FAILED,
                                         curr_renewal_failed );
                requestScheddUpdate( curr_job, false );
            }
        }
    } else {
        update_succeeded.Rewind();
        PROC_ID id;
        std::string msg = " update_succeeded:";
        while ( update_succeeded.Next( id ) ) {
            formatstr_cat( msg, " %d.%d", id.cluster, id.proc );
        }
        dprintf(D_FULLDEBUG,"%s\n",msg.c_str());
        BaseJob *curr_job;
        leaseUpdates.Rewind();
        while ( leaseUpdates.Next( curr_job ) ) {
            bool curr_renewal_failed;
            bool last_renewal_failed = false;
            if ( update_succeeded.IsMember( curr_job->procID ) ) {
                dprintf(D_FULLDEBUG," %d.%d is in succeeded list\n",curr_job->procID.cluster,curr_job->procID.proc);
                curr_renewal_failed = false;
            } else {
                dprintf(D_FULLDEBUG," %d.%d is not in succeeded list\n",curr_job->procID.cluster,curr_job->procID.proc);
                curr_renewal_failed = true;
            }
            curr_job->jobAd->LookupBool( ATTR_LAST_JOB_LEASE_RENEWAL_FAILED,
                                         last_renewal_failed );
            if ( curr_renewal_failed != last_renewal_failed ) {
                curr_job->jobAd->Assign( ATTR_LAST_JOB_LEASE_RENEWAL_FAILED,
                                         curr_renewal_failed );
                requestScheddUpdate( curr_job, false );
            }
            leaseUpdates.DeleteCurrent();
        }
    }

    updateLeasesActive = false;
    dprintf(D_FULLDEBUG," UpdateLeases: lease update complete, resetting timer for 30 secs\n");
    daemonCore->Reset_Timer( updateLeasesTimerId, UPDATE_LEASE_DELAY );
}
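// The per-job state machine for Unicore jobs. Each call runs the
// machine until gmState and the remote unicore state stop changing,
// issuing (possibly asynchronous) gahp commands along the way.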
void UnicoreJob::doEvaluateState()
{
    int old_gm_state;
    int old_unicore_state;
    bool reevaluate_state = true;
    time_t now = time(NULL);

    bool attr_exists;
    bool attr_dirty;
    int rc;

    daemonCore->Reset_Timer( evaluateStateTid, TIMER_NEVER );

    dprintf(D_ALWAYS, "(%d.%d) doEvaluateState called: gmState %s, unicoreState %d\n",
            procID.cluster,procID.proc,GMStateNames[gmState],unicoreState);

    if ( gahp ) {
        gahp->setMode( GahpClient::normal );
    }

    do {
        reevaluate_state = false;
        old_gm_state = gmState;
        old_unicore_state = unicoreState;

        ASSERT ( gahp != NULL || gmState == GM_HOLD || gmState == GM_DELETE );

        switch ( gmState ) {
        case GM_INIT: {
            // This is the state all jobs start in when the GlobusJob object
            // is first created. Here, we do things that we didn't want to
            // do in the constructor because they could block (the
            // constructor is called while we're connected to the schedd).
            if ( gahp->Startup() == false ) {
                dprintf( D_ALWAYS, "(%d.%d) Error starting up GAHP\n",
                         procID.cluster, procID.proc );
                jobAd->Assign( ATTR_HOLD_REASON, "Failed to start GAHP" );
                gmState = GM_HOLD;
                break;
            }

            GahpClient::mode saved_mode = gahp->getMode();
            gahp->setMode( GahpClient::blocking );

            rc = gahp->unicore_job_callback( UnicoreGahpCallbackHandler );
            if ( rc != GLOBUS_SUCCESS ) {
                dprintf( D_ALWAYS,
                         "(%d.%d) Error enabling unicore callback, err=%d\n",
                         procID.cluster, procID.proc, rc );
                jobAd->Assign( ATTR_HOLD_REASON, "Failed to initialize GAHP" );
                gmState = GM_HOLD;
                break;
            }

            gahp->setMode( saved_mode );

            gmState = GM_START;
            } break;
        case GM_START: {
            // This state is the real start of the state machine, after
            // one-time initialization has been taken care of.
            // If we think there's a running jobmanager
            // out there, we try to register for callbacks (in GM_REGISTER).
            // The one way jobs can end up back in this state is if we
            // attempt a restart of a jobmanager only to be told that the
            // old jobmanager process is still alive.
            errorString = "";
            if ( jobContact == NULL ) {
                gmState = GM_CLEAR_REQUEST;
            } else if ( wantResubmit || doResubmit ) {
                gmState = GM_CLEAR_REQUEST;
            } else {
                if ( condorState == RUNNING ) {
                    executeLogged = true;
                }
                gmState = GM_RECOVER;
            }
            } break;
        case GM_RECOVER: {
            // We're recovering from a crash after the job was submitted.
            // Allow the gahp server to recover its internal state about
            // the job.
            if ( submitAd == NULL ) {
                submitAd = buildSubmitAd();
            }
            if ( submitAd == NULL ) {
                gmState = GM_HOLD;
                break;
            }
            rc = gahp->unicore_job_recover( submitAd->c_str() );
            if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                 rc == GAHPCLIENT_COMMAND_PENDING ) {
                break;
            }
            if ( rc != GLOBUS_SUCCESS ) {
                // unhandled error
                dprintf(D_ALWAYS,"(%d.%d) unicore_job_recover() failed\n",
                        procID.cluster, procID.proc);
                gmState = GM_CANCEL;
                break;
            }
            gmState = GM_SUBMITTED;
            } break;
        case GM_UNSUBMITTED: {
            // There are no outstanding gram submissions for this job (if
            // there is one, we've given up on it).
            if ( condorState == REMOVED ) {
                gmState = GM_DELETE;
            } else if ( condorState == HELD ) {
                gmState = GM_DELETE;
                break;
            } else {
                gmState = GM_SUBMIT;
            }
            } break;
        case GM_SUBMIT: {
            // Start a new gram submission for this job.
            char *job_contact = NULL;
            if ( condorState == REMOVED || condorState == HELD ) {
                gmState = GM_UNSUBMITTED;
                break;
            }
            if ( numSubmitAttempts >= MAX_SUBMIT_ATTEMPTS ) {
                jobAd->Assign( ATTR_HOLD_REASON, "Attempts to submit failed" );
                gmState = GM_HOLD;
                break;
            }
            // After a submit, wait at least submitInterval before trying
            // another one.
            if ( now >= lastSubmitAttempt + submitInterval ) {
                if ( submitAd == NULL ) {
                    submitAd = buildSubmitAd();
                }
                if ( submitAd == NULL ) {
                    gmState = GM_HOLD;
                    break;
                }
                rc = gahp->unicore_job_create( submitAd->c_str(),
                                               &job_contact );
                if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                     rc == GAHPCLIENT_COMMAND_PENDING ) {
                    break;
                }
                lastSubmitAttempt = time(NULL);
                numSubmitAttempts++;
                if ( rc == GLOBUS_SUCCESS ) {
                    // job_contact was strdup()ed for us. Now we take
                    // responsibility for free()ing it.
                    SetRemoteJobId( job_contact );
                    free( job_contact );
                    WriteGridSubmitEventToUserLog( jobAd );
                    gmState = GM_SUBMIT_SAVE;
                } else {
                    // unhandled error
                    dprintf(D_ALWAYS,"(%d.%d) unicore_job_create() failed\n",
                            procID.cluster, procID.proc);
                    dprintf(D_ALWAYS,"(%d.%d) submitAd='%s'\n",
                            procID.cluster, procID.proc,submitAd->c_str());
                    if ( job_contact ) {
                        free( job_contact );
                    }
                    gmState = GM_UNSUBMITTED;
                    reevaluate_state = true;
                }
            } else if ( condorState == REMOVED || condorState == HELD ) {
                gmState = GM_UNSUBMITTED;
            } else {
                unsigned int delay = 0;
                if ( (lastSubmitAttempt + submitInterval) > now ) {
                    delay = (lastSubmitAttempt + submitInterval) - now;
                }
                daemonCore->Reset_Timer( evaluateStateTid, delay );
            }
            } break;
        case GM_SUBMIT_SAVE: {
            // Save the jobmanager's contact for a new gram submission.
            if ( condorState == REMOVED || condorState == HELD ) {
                gmState = GM_CANCEL;
            } else {
                jobAd->GetDirtyFlag( ATTR_GRID_JOB_ID,
                                     &attr_exists, &attr_dirty );
                if ( attr_exists && attr_dirty ) {
                    requestScheddUpdate( this, true );
                    break;
                }
                gmState = GM_SUBMIT_COMMIT;
            }
            } break;
        case GM_SUBMIT_COMMIT: {
            // Now that we've saved the jobmanager's contact, commit the
            // gram job submission.
            if ( condorState == REMOVED || condorState == HELD ) {
                gmState = GM_CANCEL;
            } else {
                rc = gahp->unicore_job_start( jobContact );
                if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                     rc == GAHPCLIENT_COMMAND_PENDING ) {
                    break;
                }
                if ( rc != GLOBUS_SUCCESS ) {
                    // unhandled error
                    dprintf(D_ALWAYS,"(%d.%d) unicore_job_start() failed\n",
                            procID.cluster, procID.proc);
                    gmState = GM_CANCEL;
                } else {
                    gmState = GM_SUBMITTED;
                }
            }
            } break;
        case GM_SUBMITTED: {
            // The job has been submitted (or is about to be by the
            // jobmanager). Wait for completion or failure, and probe the
            // jobmanager occasionally to make sure it's still alive.
            if ( unicoreState == COMPLETED ) {
                gmState = GM_DONE_SAVE;
//            } else if ( unicoreState == GLOBUS_GRAM_PROTOCOL_JOB_STATE_FAILED ) {
//                gmState = GM_CANCEL;
            } else if ( condorState == REMOVED || condorState == HELD ) {
                gmState = GM_CANCEL;
            } else if ( newRemoteStatusAd ) {
                dprintf(D_FULLDEBUG,"(%d.%d) *** Processing callback ad\n",
                        procID.cluster, procID.proc );
                lastProbeTime = now;
                UpdateUnicoreState( newRemoteStatusAd );
                delete newRemoteStatusAd;
                newRemoteStatusAd = NULL;
                reevaluate_state = true;
/* Now that the gahp tells us when a job status changes, we don't need to
 * do active probes.
            } else {
                if ( lastProbeTime < enteredCurrentGmState ) {
                    lastProbeTime = enteredCurrentGmState;
                }
                if ( probeNow ) {
                    lastProbeTime = 0;
                    probeNow = false;
                }
                if ( now >= lastProbeTime + probeInterval ) {
                    gmState = GM_PROBE_JOBMANAGER;
                    break;
                }
                unsigned int delay = 0;
                if ( (lastProbeTime + probeInterval) > now ) {
                    delay = (lastProbeTime + probeInterval) - now;
                }
                daemonCore->Reset_Timer( evaluateStateTid, delay );
*/
            }
            } break;
        case GM_PROBE_JOBMANAGER: {
            if ( condorState == REMOVED || condorState == HELD ) {
                gmState = GM_CANCEL;
            } else {
                char *status_ad = NULL;
                rc = gahp->unicore_job_status( jobContact, &status_ad );
                if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                     rc == GAHPCLIENT_COMMAND_PENDING ) {
                    break;
                }
                if ( rc != GLOBUS_SUCCESS ) {
                    // unhandled error
                    dprintf(D_ALWAYS,"(%d.%d) unicore_job_status() failed\n",
                            procID.cluster, procID.proc);
                    if ( status_ad ) {
                        free( status_ad );
                    }
                    gmState = GM_CANCEL;
                    break;
                }
                UpdateUnicoreState( status_ad );
                if ( status_ad ) {
                    free( status_ad );
                }
                if ( newRemoteStatusAd ) {
                    delete newRemoteStatusAd;
                    newRemoteStatusAd = NULL;
                }
                lastProbeTime = now;
                gmState = GM_SUBMITTED;
            }
            } break;
        case GM_DONE_SAVE: {
            // Report job completion to the schedd.
            JobTerminated();
            if ( condorState == COMPLETED ) {
                jobAd->GetDirtyFlag( ATTR_JOB_STATUS,
                                     &attr_exists, &attr_dirty );
                if ( attr_exists && attr_dirty ) {
                    requestScheddUpdate( this, true );
                    break;
                }
            }
            gmState = GM_DONE_COMMIT;
            } break;
        case GM_DONE_COMMIT: {
            // Tell the jobmanager it can clean up and exit.
            rc = gahp->unicore_job_destroy( jobContact );
            if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                 rc == GAHPCLIENT_COMMAND_PENDING ) {
                break;
            }
            if ( rc != GLOBUS_SUCCESS ) {
                // unhandled error
                dprintf(D_ALWAYS,"(%d.%d) unicore_job_destroy() failed\n",
                        procID.cluster, procID.proc);
                gmState = GM_CANCEL;
                break;
            }
            if ( condorState == COMPLETED || condorState == REMOVED ) {
                gmState = GM_DELETE;
            } else {
                // Clear the contact string here because it may not get
                // cleared in GM_CLEAR_REQUEST (it might go to GM_HOLD first).
//                SetRemoteJobId( NULL );
//                gmState = GM_CLEAR_REQUEST;
                gmState = GM_HOLD;
            }
            } break;
        case GM_CANCEL: {
            // We need to cancel the job submission.
//            if ( unicoreState != COMPLETED &&
//                 unicoreState != GLOBUS_GRAM_PROTOCOL_JOB_STATE_FAILED ) {
            if ( unicoreState != COMPLETED ) {
                rc = gahp->unicore_job_destroy( jobContact );
                if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                     rc == GAHPCLIENT_COMMAND_PENDING ) {
                    break;
                }
                if ( rc != GLOBUS_SUCCESS ) {
                    // unhandled error
                    dprintf(D_ALWAYS,"(%d.%d) unicore_job_destroy() failed\n",
                            procID.cluster, procID.proc);
                    gmState = GM_HOLD;
                    break;
                }
            }
            if ( condorState == REMOVED ) {
                gmState = GM_DELETE;
            } else {
                gmState = GM_CLEAR_REQUEST;
            }
            } break;
        case GM_DELETE: {
            // We are done with the job. Propagate any remaining updates
            // to the schedd, then delete this object.
            DoneWithJob();
            // This object will be deleted when the update occurs
            } break;
        case GM_CLEAR_REQUEST: {
            // Remove all knowledge of any previous or present job
            // submission, in both the gridmanager and the schedd.
            errorString = "";
            SetRemoteJobId( NULL );
            SetRemoteJobStatus( NULL );

            JobIdle();

            // If there are no updates to be done when we first enter this
            // state, requestScheddUpdate will return done immediately
            // and not waste time with a needless connection to the
            // schedd. If updates need to be made, they won't show up in
            // schedd_actions after the first pass through this state
            // because we modified our local variables the first time
            // through. However, since we registered update events the
            // first time, requestScheddUpdate won't return done until
            // they've been committed to the schedd.
            const char *name;
            ExprTree *expr;
            jobAd->ResetExpr();
            if ( jobAd->NextDirtyExpr(name, expr) ) {
                requestScheddUpdate( this, true );
                break;
            }
            executeLogged = false;
            terminateLogged = false;
            abortLogged = false;
            evictLogged = false;
            gmState = GM_UNSUBMITTED;
            } break;
        case GM_HOLD: {
            // Put the job on hold in the schedd.
            // TODO: what happens if we learn here that the job is removed?
//            if ( jobContact &&
//                 unicoreState != GLOBUS_GRAM_PROTOCOL_JOB_STATE_UNKNOWN ) {
//                unicoreState = GLOBUS_GRAM_PROTOCOL_JOB_STATE_UNKNOWN;
//            }
            // If the condor state is already HELD, then someone already
            // HELD it, so don't update anything else.
            if ( condorState != HELD ) {
                // Set the hold reason as best we can
                // TODO: set the hold reason in a more robust way.
                char holdReason[1024];
                holdReason[0] = '\0';
                holdReason[sizeof(holdReason)-1] = '\0';
                jobAd->LookupString( ATTR_HOLD_REASON, holdReason,
                                     sizeof(holdReason) );
                if ( holdReason[0] == '\0' && errorString != "" ) {
                    strncpy( holdReason, errorString.c_str(),
                             sizeof(holdReason) - 1 );
                }
                if ( holdReason[0] == '\0' ) {
                    strncpy( holdReason, "Unspecified gridmanager error",
                             sizeof(holdReason) - 1 );
                }

                JobHeld( holdReason );
            }
            gmState = GM_DELETE;
            } break;
        default:
            EXCEPT( "(%d.%d) Unknown gmState %d!",
                    procID.cluster, procID.proc, gmState );
        }

        if ( gmState != old_gm_state || unicoreState != old_unicore_state ) {
            reevaluate_state = true;
        }
        if ( unicoreState != old_unicore_state ) {
//            dprintf(D_FULLDEBUG, "(%d.%d) globus state change: %s -> %s\n",
//                    procID.cluster, procID.proc,
//                    GlobusJobStatusName(old_globus_state),
//                    GlobusJobStatusName(globusState));
            enteredCurrentUnicoreState = time(NULL);
        }
        if ( gmState != old_gm_state ) {
            dprintf(D_FULLDEBUG, "(%d.%d) gm state change: %s -> %s\n",
                    procID.cluster, procID.proc,
                    GMStateNames[old_gm_state], GMStateNames[gmState]);
            enteredCurrentGmState = time(NULL);

            // If we were waiting for a pending unicore call, we're not
            // anymore, so purge it.
            if ( gahp ) {
                gahp->purgePendingRequests();
            }

            // If we were calling unicore_job_create and using submitAd,
            // we're done with it now, so free it.
            if ( submitAd ) {
                delete submitAd;
                submitAd = NULL;
            }
        }
    } while ( reevaluate_state );
}
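// The per-job state machine for NorduGrid jobs, structured like the
// Unicore one above, with explicit stage-in/stage-out states and an
// LDAP query (GM_EXIT_INFO) to collect the job's exit code and usage.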
void NordugridJob::doEvaluateState()
{
    int old_gm_state;
    bool reevaluate_state = true;
    time_t now = time(NULL);

    bool attr_exists;
    bool attr_dirty;
    int rc;

    daemonCore->Reset_Timer( evaluateStateTid, TIMER_NEVER );

    dprintf(D_ALWAYS, "(%d.%d) doEvaluateState called: gmState %s, condorState %d\n",
            procID.cluster,procID.proc,GMStateNames[gmState],condorState);

    if ( gahp ) {
        if ( !resourceStateKnown || resourcePingPending || resourceDown ) {
            gahp->setMode( GahpClient::results_only );
        } else {
            gahp->setMode( GahpClient::normal );
        }
    }

    do {
        reevaluate_state = false;
        old_gm_state = gmState;

        switch ( gmState ) {
        case GM_INIT: {
            // This is the state all jobs start in when the GlobusJob object
            // is first created. Here, we do things that we didn't want to
            // do in the constructor because they could block (the
            // constructor is called while we're connected to the schedd).
            if ( gahp->Startup() == false ) {
                dprintf( D_ALWAYS, "(%d.%d) Error starting GAHP\n",
                         procID.cluster, procID.proc );
                jobAd->Assign( ATTR_HOLD_REASON, "Failed to start GAHP" );
                gmState = GM_HOLD;
                break;
            }
            if ( gahp->Initialize( jobProxy ) == false ) {
                dprintf( D_ALWAYS, "(%d.%d) Error initializing GAHP\n",
                         procID.cluster, procID.proc );
                jobAd->Assign( ATTR_HOLD_REASON, "Failed to initialize GAHP" );
                gmState = GM_HOLD;
                break;
            }

            gahp->setDelegProxy( jobProxy );

            gmState = GM_START;
            } break;
        case GM_START: {
            errorString = "";
            if ( remoteJobId == NULL ) {
                gmState = GM_CLEAR_REQUEST;
            } else {
                submitLogged = true;
                if ( condorState == RUNNING || condorState == COMPLETED ) {
                    executeLogged = true;
                }
                if ( remoteJobState == "" ||
                     remoteJobState == REMOTE_STATE_ACCEPTING ||
                     remoteJobState == REMOTE_STATE_ACCEPTED ||
                     remoteJobState == REMOTE_STATE_PREPARING ) {
                    gmState = GM_RECOVER_QUERY;
                } else {
                    gmState = GM_SUBMITTED;
                }
            }
            } break;
        case GM_RECOVER_QUERY: {
            if ( condorState == REMOVED || condorState == HELD ) {
                gmState = GM_CANCEL;
            } else {
                if ( m_lastRemoteStatusUpdate > enteredCurrentGmState ) {
                    if ( remoteJobState == REMOTE_STATE_ACCEPTING ||
                         remoteJobState == REMOTE_STATE_ACCEPTED ||
                         remoteJobState == REMOTE_STATE_PREPARING ) {
                        gmState = GM_STAGE_IN;
                    } else {
                        gmState = GM_SUBMITTED;
                    }
                } else if ( m_currentStatusUnknown ) {
                    gmState = GM_CANCEL;
                }
            }
            } break;
        case GM_UNSUBMITTED: {
            if ( condorState == REMOVED ) {
                gmState = GM_DELETE;
            } else if ( condorState == HELD ) {
                gmState = GM_DELETE;
                break;
            } else {
                gmState = GM_SUBMIT;
            }
            } break;
        case GM_SUBMIT: {
            if ( condorState == REMOVED || condorState == HELD ) {
                myResource->CancelSubmit( this );
                gmState = GM_UNSUBMITTED;
                break;
            }
            if ( numSubmitAttempts >= MAX_SUBMIT_ATTEMPTS ) {
//                jobAd->Assign( ATTR_HOLD_REASON,
//                               "Attempts to submit failed" );
                gmState = GM_HOLD;
                break;
            }
            // After a submit, wait at least submitInterval before trying
            // another one.
            if ( now >= lastSubmitAttempt + submitInterval ) {
                char *job_id = NULL;

                // Once RequestSubmit() is called at least once, you must
                // call CancelSubmit() once you're done with the request call
                if ( myResource->RequestSubmit( this ) == false ) {
                    break;
                }

                if ( RSL == NULL ) {
                    RSL = buildSubmitRSL();
                }
                if ( RSL == NULL ) {
                    gmState = GM_HOLD;
                    break;
                }
                rc = gahp->nordugrid_submit( resourceManagerString,
                                             RSL->c_str(), job_id );
                if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                     rc == GAHPCLIENT_COMMAND_PENDING ) {
                    break;
                }
                lastSubmitAttempt = time(NULL);
                numSubmitAttempts++;
                if ( rc == 0 ) {
                    ASSERT( job_id != NULL );
                    SetRemoteJobId( job_id );
                    free( job_id );
                    WriteGridSubmitEventToUserLog( jobAd );
                    gmState = GM_SUBMIT_SAVE;
                } else {
                    errorString = gahp->getErrorString();
                    dprintf(D_ALWAYS,"(%d.%d) job submit failed: %s\n",
                            procID.cluster, procID.proc,
                            errorString.c_str() );
                    myResource->CancelSubmit( this );
                    gmState = GM_UNSUBMITTED;
                }
            } else {
                unsigned int delay = 0;
                if ( (lastSubmitAttempt + submitInterval) > now ) {
                    delay = (lastSubmitAttempt + submitInterval) - now;
                }
                daemonCore->Reset_Timer( evaluateStateTid, delay );
            }
            } break;
        case GM_SUBMIT_SAVE: {
            if ( condorState == REMOVED || condorState == HELD ) {
                gmState = GM_CANCEL;
            } else {
                jobAd->GetDirtyFlag( ATTR_GRID_JOB_ID,
                                     &attr_exists, &attr_dirty );
                if ( attr_exists && attr_dirty ) {
                    requestScheddUpdate( this, true );
                    break;
                }
                gmState = GM_STAGE_IN;
            }
            } break;
        case GM_STAGE_IN: {
            if ( stageList == NULL ) {
                const char *file;
                stageList = buildStageInList();
                stageList->rewind();
                while ( (file = stageList->next()) ) {
                    if ( IsUrl( file ) ) {
                        stageList->deleteCurrent();
                    }
                }
            }
            rc = gahp->nordugrid_stage_in( resourceManagerString, remoteJobId,
                                           *stageList );
            if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                 rc == GAHPCLIENT_COMMAND_PENDING ) {
                break;
            } else if ( rc != 0 ) {
                errorString = gahp->getErrorString();
                dprintf( D_ALWAYS, "(%d.%d) file stage in failed: %s\n",
                         procID.cluster, procID.proc, errorString.c_str() );
                gmState = GM_CANCEL;
            } else {
                gmState = GM_SUBMITTED;
            }
            } break;
        case GM_SUBMITTED: {
            if ( remoteJobState == REMOTE_STATE_FINISHED ||
                 remoteJobState == REMOTE_STATE_FAILED ||
                 remoteJobState == REMOTE_STATE_KILLED ||
                 remoteJobState == REMOTE_STATE_DELETED ) {
                gmState = GM_EXIT_INFO;
            } else if ( condorState == REMOVED || condorState == HELD ) {
                gmState = GM_CANCEL;
            } else {
                if ( lastProbeTime < enteredCurrentGmState ) {
                    lastProbeTime = enteredCurrentGmState;
                }
                if ( probeNow ) {
                    lastProbeTime = 0;
                    probeNow = false;
                }
/*
                int probe_interval = myResource->GetJobPollInterval();
                if ( now >= lastProbeTime + probe_interval ) {
                    gmState = GM_PROBE_JOB;
                    break;
                }
                unsigned int delay = 0;
                if ( (lastProbeTime + probe_interval) > now ) {
                    delay = (lastProbeTime + probe_interval) - now;
                }
                daemonCore->Reset_Timer( evaluateStateTid, delay );
*/
            }
            } break;
        case GM_PROBE_JOB: {
            if ( condorState == REMOVED || condorState == HELD ) {
                gmState = GM_CANCEL;
            } else {
                char *new_status = NULL;
                rc = gahp->nordugrid_status( resourceManagerString,
                                             remoteJobId, new_status );
                if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                     rc == GAHPCLIENT_COMMAND_PENDING ) {
                    break;
                } else if ( rc != 0 ) {
                    // What to do about failure?
                    errorString = gahp->getErrorString();
                    dprintf( D_ALWAYS, "(%d.%d) job probe failed: %s\n",
                             procID.cluster, procID.proc,
                             errorString.c_str() );
                } else {
                    if ( new_status ) {
                        remoteJobState = new_status;
                    } else {
                        remoteJobState = "";
                    }
                    SetRemoteJobStatus( new_status );
                }
                if ( new_status ) {
                    free( new_status );
                }
                lastProbeTime = now;
                gmState = GM_SUBMITTED;
            }
            } break;
        case GM_EXIT_INFO: {
            std::string filter;
            StringList reply;
            formatstr( filter, "nordugrid-job-globalid=gsiftp://%s:2811/jobs/%s",
                       resourceManagerString, remoteJobId );

            rc = gahp->nordugrid_ldap_query( resourceManagerString,
                    "mds-vo-name=local,o=grid", filter.c_str(),
                    "nordugrid-job-usedcputime,nordugrid-job-usedwalltime,nordugrid-job-exitcode",
                    reply );
            if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                 rc == GAHPCLIENT_COMMAND_PENDING ) {
                break;
            } else if ( rc != 0 ) {
                errorString = gahp->getErrorString();
                dprintf( D_ALWAYS, "(%d.%d) exit info gathering failed: %s\n",
                         procID.cluster, procID.proc, errorString.c_str() );
                gmState = GM_CANCEL;
            } else {
                int exit_code = -1;
                int wallclock = -1;
                int cpu = -1;
                const char *entry;
                reply.rewind();
                while ( (entry = reply.next()) ) {
                    if ( !strncmp( entry, "nordugrid-job-usedcputime: ", 27 ) ) {
                        entry = strchr( entry, ' ' ) + 1;
                        cpu = atoi( entry );
                    } else if ( !strncmp( entry, "nordugrid-job-usedwalltime: ", 28 ) ) {
                        entry = strchr( entry, ' ' ) + 1;
                        wallclock = atoi( entry );
                    } else if ( !strncmp( entry, "nordugrid-job-exitcode: ", 24 ) ) {
                        entry = strchr( entry, ' ' ) + 1;
                        exit_code = atoi( entry );
                    }
                }
                if ( exit_code < 0 || wallclock < 0 || cpu < 0 ) {
                    dprintf( D_ALWAYS, "(%d.%d) exit info missing\n",
                             procID.cluster, procID.proc );
                    gmState = GM_CANCEL;
                    break;
                }
                if ( exit_code > 128 ) {
                    jobAd->Assign( ATTR_ON_EXIT_BY_SIGNAL, true );
                    jobAd->Assign( ATTR_ON_EXIT_SIGNAL, exit_code - 128 );
                } else {
                    jobAd->Assign( ATTR_ON_EXIT_BY_SIGNAL, false );
                    jobAd->Assign( ATTR_ON_EXIT_CODE, exit_code );
                }
                jobAd->Assign( ATTR_JOB_REMOTE_WALL_CLOCK, wallclock * 60 );
                jobAd->Assign( ATTR_JOB_REMOTE_USER_CPU, cpu * 60 );
                gmState = GM_STAGE_OUT;
            }
            } break;
        case GM_STAGE_OUT: {
            if ( stageList == NULL ) {
                stageList = buildStageOutList();
            }
            if ( stageLocalList == NULL ) {
                const char *file;
                stageLocalList = buildStageOutLocalList( stageList );
                stageList->rewind();
                stageLocalList->rewind();
                while ( (file = stageLocalList->next()) ) {
                    ASSERT( stageList->next() );
                    if ( IsUrl( file ) ) {
                        stageList->deleteCurrent();
                        stageLocalList->deleteCurrent();
                    }
                }
            }
            rc = gahp->nordugrid_stage_out2( resourceManagerString,
                                             remoteJobId,
                                             *stageList, *stageLocalList );
            if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                 rc == GAHPCLIENT_COMMAND_PENDING ) {
                break;
            } else if ( rc != 0 ) {
                errorString = gahp->getErrorString();
                dprintf( D_ALWAYS, "(%d.%d) file stage out failed: %s\n",
                         procID.cluster, procID.proc, errorString.c_str() );
                gmState = GM_CANCEL;
            } else {
                gmState = GM_DONE_SAVE;
            }
            } break;
        case GM_DONE_SAVE: {
            if ( condorState != HELD && condorState != REMOVED ) {
                JobTerminated();
                if ( condorState == COMPLETED ) {
                    jobAd->GetDirtyFlag( ATTR_JOB_STATUS,
                                         &attr_exists, &attr_dirty );
                    if ( attr_exists && attr_dirty ) {
                        requestScheddUpdate( this, true );
                        break;
                    }
                }
            }
            gmState = GM_DONE_COMMIT;
            } break;
        case GM_DONE_COMMIT: {
            rc = gahp->nordugrid_cancel( resourceManagerString, remoteJobId );
            if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                 rc == GAHPCLIENT_COMMAND_PENDING ) {
                break;
            } else if ( rc != 0 ) {
                errorString = gahp->getErrorString();
                dprintf( D_ALWAYS, "(%d.%d) job cleanup failed: %s\n",
                         procID.cluster, procID.proc, errorString.c_str() );
                gmState = GM_HOLD;
                break;
            }
            myResource->CancelSubmit( this );
            if ( condorState == COMPLETED || condorState == REMOVED ) {
                gmState = GM_DELETE;
            } else {
                // Clear the contact string here because it may not get
                // cleared in GM_CLEAR_REQUEST (it might go to GM_HOLD first).
                if ( remoteJobId != NULL ) {
                    SetRemoteJobId( NULL );
                }
                gmState = GM_CLEAR_REQUEST;
            }
            } break;
        case GM_CANCEL: {
            rc = gahp->nordugrid_cancel( resourceManagerString, remoteJobId );
            if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
                 rc == GAHPCLIENT_COMMAND_PENDING ) {
                break;
            } else if ( rc == 0 ) {
                gmState = GM_FAILED;
            } else {
                // What to do about a failed cancel?
                errorString = gahp->getErrorString();
                dprintf( D_ALWAYS, "(%d.%d) job cancel failed: %s\n",
                         procID.cluster, procID.proc, errorString.c_str() );
                gmState = GM_FAILED;
            }
            } break;
        case GM_FAILED: {
            myResource->CancelSubmit( this );
            SetRemoteJobId( NULL );

            if ( condorState == REMOVED ) {
                gmState = GM_DELETE;
            } else {
                gmState = GM_CLEAR_REQUEST;
            }
            } break;
        case GM_DELETE: {
            // The job has completed or been removed. Delete it from the
            // schedd.
            DoneWithJob();
            // This object will be deleted when the update occurs
            } break;
        case GM_CLEAR_REQUEST: {
            // Remove all knowledge of any previous or present job
            // submission, in both the gridmanager and the schedd.

            // If we are doing a rematch, we are simply waiting around
            // for the schedd to be updated and subsequently this globus job
            // object to be destroyed. So there is nothing to do.
            if ( wantRematch ) {
                break;
            }

            // For now, put problem jobs on hold instead of
            // forgetting about current submission and trying again.
            // TODO: Let our action here be dictated by the user preference
            // expressed in the job ad.
            if ( remoteJobId != NULL && condorState != REMOVED &&
                 wantResubmit == 0 && doResubmit == 0 ) {
                gmState = GM_HOLD;
                break;
            }

            // Only allow a rematch *if* we are also going to perform
            // a resubmit
            if ( wantResubmit || doResubmit ) {
                jobAd->EvalBool( ATTR_REMATCH_CHECK, NULL, wantRematch );
            }

            if ( wantResubmit ) {
                wantResubmit = 0;
                dprintf(D_ALWAYS, "(%d.%d) Resubmitting to Globus because %s==TRUE\n",
                        procID.cluster, procID.proc,
                        ATTR_GLOBUS_RESUBMIT_CHECK );
            }
            if ( doResubmit ) {
                doResubmit = 0;
                dprintf(D_ALWAYS, "(%d.%d) Resubmitting to Globus (last submit failed)\n",
                        procID.cluster, procID.proc );
            }

            errorString = "";
            if ( remoteJobId != NULL ) {
                SetRemoteJobId( NULL );
            }
            JobIdle();
            if ( submitLogged ) {
                JobEvicted();
                if ( !evictLogged ) {
                    WriteEvictEventToUserLog( jobAd );
                    evictLogged = true;
                }
            }
            myResource->CancelSubmit( this );

            if ( wantRematch ) {
                dprintf(D_ALWAYS, "(%d.%d) Requesting schedd to rematch job because %s==TRUE\n",
                        procID.cluster, procID.proc, ATTR_REMATCH_CHECK );

                // Set ad attributes so the schedd finds a new match.
                int dummy;
                if ( jobAd->LookupBool( ATTR_JOB_MATCHED, dummy ) != 0 ) {
                    jobAd->Assign( ATTR_JOB_MATCHED, false );
                    jobAd->Assign( ATTR_CURRENT_HOSTS, 0 );
                }

                // If we are rematching, we need to forget about this job
                // cuz we wanna pull a fresh new job ad, with a fresh new
                // match, from the all-singing schedd.
                gmState = GM_DELETE;
                break;
            }

            // If there are no updates to be done when we first enter this
            // state, requestScheddUpdate will return done immediately
            // and not waste time with a needless connection to the
            // schedd. If updates need to be made, they won't show up in
            // schedd_actions after the first pass through this state
            // because we modified our local variables the first time
            // through. However, since we registered update events the
            // first time, requestScheddUpdate won't return done until
            // they've been committed to the schedd.
            const char *name;
            ExprTree *expr;
            jobAd->ResetExpr();
            if ( jobAd->NextDirtyExpr(name, expr) ) {
                requestScheddUpdate( this, true );
                break;
            }
            if ( remoteJobState != "" ) {
                remoteJobState = "";
                SetRemoteJobStatus( NULL );
            }
            submitLogged = false;
            executeLogged = false;
            submitFailedLogged = false;
            terminateLogged = false;
            abortLogged = false;
            evictLogged = false;
            gmState = GM_UNSUBMITTED;
            } break;
        case GM_HOLD: {
            // Put the job on hold in the schedd.
            // TODO: what happens if we learn here that the job is removed?
            // If the condor state is already HELD, then someone already
            // HELD it, so don't update anything else.
            if ( condorState != HELD ) {
                // Set the hold reason as best we can
                // TODO: set the hold reason in a more robust way.
                char holdReason[1024];
                holdReason[0] = '\0';
                holdReason[sizeof(holdReason)-1] = '\0';
                jobAd->LookupString( ATTR_HOLD_REASON, holdReason,
                                     sizeof(holdReason) );
                if ( holdReason[0] == '\0' && errorString != "" ) {
                    strncpy( holdReason, errorString.c_str(),
                             sizeof(holdReason) - 1 );
                }
                if ( holdReason[0] == '\0' ) {
                    strncpy( holdReason, "Unspecified gridmanager error",
                             sizeof(holdReason) - 1 );
                }

                JobHeld( holdReason );
            }
            gmState = GM_DELETE;
            } break;
        default:
            EXCEPT( "(%d.%d) Unknown gmState %d!",
                    procID.cluster, procID.proc, gmState );
        }

        if ( gmState != old_gm_state ) {
            reevaluate_state = true;
            dprintf(D_FULLDEBUG, "(%d.%d) gm state change: %s -> %s\n",
                    procID.cluster, procID.proc,
                    GMStateNames[old_gm_state], GMStateNames[gmState]);
            enteredCurrentGmState = time(NULL);

            // If we were calling a gahp call that used RSL, we're done
            // with it now, so free it.
            if ( RSL ) {
                delete RSL;
                RSL = NULL;
            }
            if ( stageList ) {
                delete stageList;
                stageList = NULL;
            }
            if ( stageLocalList ) {
                delete stageLocalList;
                stageLocalList = NULL;
            }
        }
    } while ( reevaluate_state );
}