Esempio n. 1
0
/**
 * Record the status string most recently reported for the remote job.
 *
 * Any non-NULL argument (including an empty string) refreshes the
 * last-remote-status-update timestamp and, if the status had previously
 * been flagged as unknown, clears that flag and logs the "status known"
 * event. A NULL or empty argument means "no status".
 *
 * @param job_status  New remote status string, or NULL/"" for none.
 * @return true if the stored ATTR_GRID_JOB_STATUS value changed,
 *         false if it already matched the new value.
 */
bool BaseJob::SetRemoteJobStatus( const char *job_status )
{
    if ( job_status != NULL ) {
        // We heard from the remote side: stamp the time of contact.
        m_lastRemoteStatusUpdate = time(NULL);
        jobAd->Assign( ATTR_LAST_REMOTE_STATUS_UPDATE, m_lastRemoteStatusUpdate );
        requestScheddUpdate( this, false );

        if ( m_currentStatusUnknown == true ) {
            m_currentStatusUnknown = false;
            jobAd->Assign( ATTR_CURRENT_STATUS_UNKNOWN, false );
            WriteJobStatusKnownEventToUserLog( jobAd );
        }
    }

    std::string previous_status;
    jobAd->LookupString( ATTR_GRID_JOB_STATUS, previous_status );

    // NULL and "" both collapse to the empty string.
    const std::string incoming_status = ( job_status != NULL ) ? job_status : "";

    if ( previous_status == incoming_status ) {
        return false;
    }

    // Clear the old value first, then store the new one (if any).
    if ( !previous_status.empty() ) {
        jobAd->AssignExpr( ATTR_GRID_JOB_STATUS, "Undefined" );
    }
    if ( !incoming_status.empty() ) {
        jobAd->Assign( ATTR_GRID_JOB_STATUS, incoming_status.c_str() );
    }

    requestScheddUpdate( this, false );
    return true;
}
Esempio n. 2
0
/**
 * Set (or clear) the remote job id stored in the job ad and keep the
 * JobsByRemoteId table in step with it.
 *
 * Going from "no id" to an id stamps the last-remote-status-update time;
 * going to "no id" zeroes that timestamp and clears the status-unknown flag.
 * Nothing happens if the id is unchanged.
 *
 * @param job_id  New remote job id, or NULL/"" to clear it.
 */
void BaseJob::SetRemoteJobId( const char *job_id )
{
    std::string prior_id;
    jobAd->LookupString( ATTR_GRID_JOB_ID, prior_id );

    // NULL and "" both mean "no remote id".
    const std::string next_id = ( job_id != NULL ) ? job_id : "";

    if ( prior_id == next_id ) {
        return;
    }

    if ( prior_id.empty() ) {
        // There was no previous id: start the status clock now.
        m_lastRemoteStatusUpdate = time(NULL);
        jobAd->Assign( ATTR_LAST_REMOTE_STATUS_UPDATE, m_lastRemoteStatusUpdate );
    } else {
        JobsByRemoteId.remove( HashKey( prior_id.c_str() ) );
        jobAd->AssignExpr( ATTR_GRID_JOB_ID, "Undefined" );
    }

    if ( next_id.empty() ) {
        // The id is being cleared: reset the status-tracking state.
        m_lastRemoteStatusUpdate = 0;
        jobAd->Assign( ATTR_LAST_REMOTE_STATUS_UPDATE, 0 );
        m_currentStatusUnknown = false;
        jobAd->Assign( ATTR_CURRENT_STATUS_UNKNOWN, false );
    } else {
        JobsByRemoteId.insert( HashKey( next_id.c_str() ), this );
        jobAd->Assign( ATTR_GRID_JOB_ID, next_id.c_str() );
    }

    requestScheddUpdate( this, false );
}
Esempio n. 3
0
/**
 * Mark this job as finished from the gridmanager's point of view.
 *
 * Always flags the object for deletion from the gridmanager. Depending on
 * condorState it also sets ATTR_JOB_MANAGED, writes any not-yet-logged
 * terminate/abort/hold event, and (for COMPLETED and REMOVED) flags the job
 * for deletion from the schedd. Any state other than COMPLETED, REMOVED,
 * HELD or IDLE triggers EXCEPT.
 */
void BaseJob::DoneWithJob()
{
    deleteFromGridmanager = true;

    if ( condorState == COMPLETED ) {
        // Finished for good: hand it off as done and drop it from the schedd.
        jobAd->Assign( ATTR_JOB_MANAGED, MANAGED_DONE );
        if ( writeUserLog && !terminateLogged ) {
            WriteTerminateEventToUserLog( jobAd );
            EmailTerminateEvent( jobAd, exitStatusKnown );
            terminateLogged = true;
        }
        deleteFromSchedd = true;
    } else if ( condorState == REMOVED ) {
        // Removed for good: hand it off as done and drop it from the schedd.
        jobAd->Assign( ATTR_JOB_MANAGED, MANAGED_DONE );
        if ( writeUserLog && !abortLogged ) {
            WriteAbortEventToUserLog( jobAd );
            abortLogged = true;
        }
        deleteFromSchedd = true;
    } else if ( condorState == HELD ) {
        // Held jobs go back under the schedd's management.
        jobAd->Assign( ATTR_JOB_MANAGED, MANAGED_SCHEDD );
        if ( writeUserLog && !holdLogged ) {
            WriteHoldEventToUserLog( jobAd );
            holdLogged = true;
        }
    } else if ( condorState == IDLE ) {
        jobAd->Assign( ATTR_JOB_MANAGED, MANAGED_SCHEDD );
    } else {
        EXCEPT("BaseJob::DoneWithJob called with unexpected state %s (%d)", getJobStatusString(condorState), condorState);
    }

    requestScheddUpdate( this, false );
}
Esempio n. 4
0
/**
 * Move the job into the REMOVED state and record why.
 *
 * Does nothing if the job is already REMOVED.
 *
 * @param remove_reason  Text stored in ATTR_REMOVE_REASON.
 */
void BaseJob::JobRemoved( const char *remove_reason )
{
    if ( condorState == REMOVED ) {
        return;
    }

    condorState = REMOVED;
    jobAd->Assign( ATTR_JOB_STATUS, condorState );
    jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );
    jobAd->Assign( ATTR_REMOVE_REASON, remove_reason );

    UpdateRuntimeStats();
    requestScheddUpdate( this, false );
}
Esempio n. 5
0
/**
 * Move the job into the COMPLETED state.
 *
 * Ignored when the job is already COMPLETED, or is HELD or REMOVED.
 */
void BaseJob::JobCompleted()
{
    const bool transition_blocked = ( condorState == COMPLETED ||
                                      condorState == HELD ||
                                      condorState == REMOVED );
    if ( transition_blocked ) {
        return;
    }

    condorState = COMPLETED;
    jobAd->Assign( ATTR_JOB_STATUS, condorState );
    jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );

    UpdateRuntimeStats();
    requestScheddUpdate( this, false );
}
Esempio n. 6
0
/**
 * Keep the job ad's run-time accounting in step with condorState.
 *
 * ATTR_SHADOW_BIRTHDATE marks the start of the current running interval
 * (0/absent means no interval is open). When the job is RUNNING with no
 * open interval, one is opened and ATTR_NUM_JOB_STARTS is incremented.
 * When the job is not RUNNING but an interval is open, the elapsed time is
 * added to ATTR_JOB_REMOTE_WALL_CLOCK and the interval is closed.
 * Does nothing when calcRuntimeStats is off.
 */
void BaseJob::UpdateRuntimeStats()
{
    if ( !calcRuntimeStats ) {
        return;
    }

    // Adjust run time for condor_q
    int interval_start = 0;
    jobAd->LookupInteger( ATTR_SHADOW_BIRTHDATE, interval_start );

    const bool is_running = ( condorState == RUNNING );
    const bool interval_open = ( interval_start != 0 );

    if ( is_running && !interval_open ) {

        // A new running interval begins now.
        int started_at = (int)time(NULL);
        jobAd->Assign( ATTR_SHADOW_BIRTHDATE, started_at );

        int start_count = 0;
        jobAd->LookupInteger( ATTR_NUM_JOB_STARTS, start_count );
        start_count++;
        jobAd->Assign( ATTR_NUM_JOB_STARTS, start_count );

        requestScheddUpdate( this, false );

    } else if ( !is_running && interval_open ) {

        // The running interval has ended: fold it into the accumulated
        // wall-clock total and close the interval.
        float total_wall_clock = 0;
        jobAd->LookupFloat( ATTR_JOB_REMOTE_WALL_CLOCK, total_wall_clock );
        total_wall_clock += (float)( time(NULL) - interval_start );
        jobAd->Assign( ATTR_JOB_REMOTE_WALL_CLOCK, total_wall_clock );
        jobAd->Assign( ATTR_JOB_WALL_CLOCK_CKPT,(char *)NULL );
        jobAd->AssignExpr( ATTR_SHADOW_BIRTHDATE, "UNDEFINED" );

        requestScheddUpdate( this, false );

    }
}
Esempio n. 7
0
/**
 * Put the job on hold (or remove it, if it is marked nonessential).
 *
 * If ATTR_JOB_NONESSENTIAL is true the job is removed via JobRemoved()
 * using the hold reason as the remove reason. Otherwise, unless the job is
 * already HELD, it is moved to HELD: the hold reason/code/subcode are
 * stored, any existing release reason is moved to ATTR_LAST_RELEASE_REASON,
 * ATTR_NUM_SYSTEM_HOLDS is incremented, run-time stats are updated and the
 * hold event is logged once.
 *
 * @param hold_reason    Text stored in ATTR_HOLD_REASON.
 * @param hold_code      Value stored in ATTR_HOLD_REASON_CODE.
 * @param hold_sub_code  Value stored in ATTR_HOLD_REASON_SUBCODE.
 */
void BaseJob::JobHeld( const char *hold_reason, int hold_code,
                       int hold_sub_code )
{
    bool nonessential = false;

    jobAd->LookupBool(ATTR_JOB_NONESSENTIAL,nonessential);
    if ( nonessential ) {
        // don't have the gridmanager put a job on  hold if
        // the job is nonessential.  instead, just remove it.
        JobRemoved(hold_reason);
        return;
    }
    if ( condorState != HELD ) {
        // if the job was in REMOVED state, make certain we return
        // to the removed state when it is released.
        if ( condorState == REMOVED ) {
            jobAd->Assign( ATTR_JOB_STATUS_ON_RELEASE, REMOVED );
        }
        condorState = HELD;
        jobAd->Assign( ATTR_JOB_STATUS, condorState );
        jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );

        jobAd->Assign( ATTR_HOLD_REASON, hold_reason );
        jobAd->Assign(ATTR_HOLD_REASON_CODE, hold_code);
        jobAd->Assign(ATTR_HOLD_REASON_SUBCODE, hold_sub_code);

        // Preserve the previous release reason (if any) before clearing it.
        char *release_reason = NULL;
        if ( jobAd->LookupString( ATTR_RELEASE_REASON, &release_reason ) != 0 ) {
            jobAd->Assign( ATTR_LAST_RELEASE_REASON, release_reason );
            free( release_reason );
        }
        jobAd->AssignExpr( ATTR_RELEASE_REASON, "Undefined" );

        // The lookup result is not checked, so start from 0: a job with no
        // ATTR_NUM_SYSTEM_HOLDS yet must end up with a count of 1 rather
        // than an increment of an indeterminate value.
        int num_holds = 0;
        jobAd->LookupInteger( ATTR_NUM_SYSTEM_HOLDS, num_holds );
        num_holds++;
        jobAd->Assign( ATTR_NUM_SYSTEM_HOLDS, num_holds );

        UpdateRuntimeStats();

        if ( writeUserLog && !holdLogged ) {
            WriteHoldEventToUserLog( jobAd );
            holdLogged = true;
        }

        requestScheddUpdate( this, false );
    }
}
Esempio n. 8
0
/**
 * Called when the job's resource is found to be reachable.
 *
 * Marks the resource state as known and up. If the resource had been
 * flagged as down, the resource-up events are written to the user log and
 * ATTR_GRID_RESOURCE_UNAVAILABLE_TIME is cleared. A pending resource ping
 * is marked complete. Finally the job's state is scheduled for evaluation.
 */
void BaseJob::NotifyResourceUp()
{
    resourceStateKnown = true;

    const bool was_marked_down = ( resourceDown == true );
    if ( was_marked_down ) {
        // The GlobusResourceUp event is deprecated but is still written
        // alongside the GridResourceUp event.
        WriteGlobusResourceUpEventToUserLog( jobAd );
        WriteGridResourceUpEventToUserLog( jobAd );
        jobAd->AssignExpr( ATTR_GRID_RESOURCE_UNAVAILABLE_TIME, "Undefined" );
        requestScheddUpdate( this, false );
    }
    resourceDown = false;

    if ( resourcePingPending ) {
        resourcePingComplete = true;
        resourcePingPending = false;
    }

    SetEvaluateState();
}
Esempio n. 9
0
/**
 * Flag the job's remote status as unknown if it has gone stale.
 *
 * The status is stale when more than 15 minutes have passed since
 * m_lastRemoteStatusUpdate. Nothing is done when no update time has been
 * recorded (0) or the status is already flagged as unknown.
 */
void BaseJob::CheckRemoteStatus()
{
    // TODO return time that this job status could become stale?
    // TODO compute stale_limit from job's poll interval?
    // TODO make stale_limit configurable?
    const int stale_limit = 15*60;

    if ( m_lastRemoteStatusUpdate == 0 ) {
        return;
    }
    if ( m_currentStatusUnknown == true ) {
        return;
    }
    if ( time(NULL) <= m_lastRemoteStatusUpdate + stale_limit ) {
        // Still fresh enough.
        return;
    }

    m_currentStatusUnknown = true;
    jobAd->Assign( ATTR_CURRENT_STATUS_UNKNOWN, true );
    requestScheddUpdate( this, false );
    WriteJobStatusUnknownEventToUserLog( jobAd );
    SetEvaluateState();
}
Esempio n. 10
0
/**
 * Move the job into the RUNNING state.
 *
 * Ignored when the job is already RUNNING, or is HELD or REMOVED.
 * Writes the execute event to the user log once per running interval.
 */
void BaseJob::JobRunning()
{
    const bool transition_blocked = ( condorState == RUNNING ||
                                      condorState == HELD ||
                                      condorState == REMOVED );
    if ( transition_blocked ) {
        return;
    }

    condorState = RUNNING;
    jobAd->Assign( ATTR_JOB_STATUS, condorState );
    jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );

    UpdateRuntimeStats();

    if ( writeUserLog && !executeLogged ) {
        WriteExecuteEventToUserLog( jobAd );
        executeLogged = true;
    }

    requestScheddUpdate( this, false );
}
Esempio n. 11
0
/**
 * Handler for expiry of the lease this job received.
 *
 * Cancels the expiry timer if it is still set, then unconditionally moves
 * the job to REMOVED with the reason "Job lease expired", updates run-time
 * stats, requests a schedd update and schedules a state evaluation.
 */
void BaseJob::JobLeaseReceivedExpired()
{
    dprintf( D_FULLDEBUG, "(%d.%d) BaseJob::JobLeaseReceivedExpired()\n",
             procID.cluster, procID.proc );

    const bool timer_is_set = ( jobLeaseReceivedExpiredTid != TIMER_UNSET );
    if ( timer_is_set ) {
        daemonCore->Cancel_Timer( jobLeaseReceivedExpiredTid );
        jobLeaseReceivedExpiredTid = TIMER_UNSET;
    }

    // Unlike JobRemoved(), this applies even if the job is already REMOVED.
    condorState = REMOVED;
    jobAd->Assign( ATTR_JOB_STATUS, condorState );
    jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );
    jobAd->Assign( ATTR_REMOVE_REASON, "Job lease expired" );

    UpdateRuntimeStats();
    requestScheddUpdate( this, false );
    SetEvaluateState();
}
Esempio n. 12
0
/**
 * Move the job into the IDLE state.
 *
 * Ignored when the job is already IDLE, or is HELD or REMOVED. If the job
 * was RUNNING, an evict event is written and executeLogged is reset so the
 * next run logs a fresh execute event.
 */
void BaseJob::JobIdle()
{
    const bool transition_blocked = ( condorState == IDLE ||
                                      condorState == HELD ||
                                      condorState == REMOVED );
    if ( transition_blocked ) {
        return;
    }

    // Remember the prior state before overwriting it.
    const bool was_running = ( condorState == RUNNING );

    condorState = IDLE;
    jobAd->Assign( ATTR_JOB_STATUS, condorState );
    jobAd->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );

    UpdateRuntimeStats();

    if ( was_running ) {
        WriteEvictEventToUserLog( jobAd );
        executeLogged = false;
    }

    requestScheddUpdate( this, false );
}
Esempio n. 13
0
/**
 * Record a new expiration time for the lease sent to the remote side.
 *
 * A value <= 0 means "no lease" and is stored as an undefined
 * ATTR_JOB_LEASE_EXPIRATION. If the value differs from what the job ad
 * holds, the ad is updated, the lease timers are reset, and, when a lease
 * is newly established or shortened, the resource (if any) is asked to
 * update leases.
 *
 * @param new_expiration_time  New lease expiration, or <= 0 for none.
 */
void BaseJob::UpdateJobLeaseSent( int new_expiration_time )
{
    dprintf( D_FULLDEBUG, "(%d.%d) UpdateJobLeaseSent(%d)\n",
             procID.cluster, procID.proc, new_expiration_time );

    // TIMER_UNSET doubles as the "no lease" marker for both values.
    int old_expiration_time = TIMER_UNSET;
    jobAd->LookupInteger( ATTR_JOB_LEASE_EXPIRATION,
                          old_expiration_time );

    if ( new_expiration_time <= 0 ) {
        new_expiration_time = TIMER_UNSET;
    }

    if ( new_expiration_time == old_expiration_time ) {
        return;
    }

    const bool clearing_lease = ( new_expiration_time == TIMER_UNSET );
    if ( clearing_lease ) {
        jobAd->AssignExpr( ATTR_JOB_LEASE_EXPIRATION, "Undefined" );
    } else {
        jobAd->Assign( ATTR_JOB_LEASE_EXPIRATION,
                       new_expiration_time );
    }

    const bool had_no_lease = ( old_expiration_time == TIMER_UNSET );
    const bool lease_shortened = ( !clearing_lease &&
                                   new_expiration_time < old_expiration_time );
    if ( had_no_lease || lease_shortened ) {
        BaseResource *resource = GetResource();
        if ( resource ) {
            resource->RequestUpdateLeases();
        }
    }

    requestScheddUpdate( this, false );

    SetJobLeaseTimers();
}
Esempio n. 14
0
void BaseResource::UpdateLeases()
{
dprintf(D_FULLDEBUG,"*** UpdateLeases called\n");
	if ( hasLeases == false ) {
dprintf(D_FULLDEBUG,"    Leases not supported, cancelling timer\n" );
		daemonCore->Cancel_Timer( updateLeasesTimerId );
		updateLeasesTimerId = TIMER_UNSET;
		return;
	}

	// Don't start a new lease update too soon after the previous one.
	int delay;
	delay = (lastUpdateLeases + UPDATE_LEASE_DELAY) - time(NULL);
	if ( delay > 0 ) {
		daemonCore->Reset_Timer( updateLeasesTimerId, delay );
dprintf(D_FULLDEBUG,"    UpdateLeases: last update too recent, delaying %d secs\n",delay);
		return;
	}

	daemonCore->Reset_Timer( updateLeasesTimerId, TIMER_NEVER );

    if ( updateLeasesActive == false ) {
		BaseJob *curr_job;
		time_t next_renew_time = INT_MAX;
		time_t job_renew_time;
		int min_new_expire = INT_MAX;
dprintf(D_FULLDEBUG,"    UpdateLeases: calc'ing new leases\n");
		registeredJobs.Rewind();
dprintf(D_FULLDEBUG,"    starting min_new_expire=%d next_renew_time=%ld\n",min_new_expire,next_renew_time);
		while ( registeredJobs.Next( curr_job ) ) {
			int new_expire;
			std::string  job_id;
			job_renew_time = next_renew_time;
				// Don't update the lease for a job that isn't submitted
				// anywhere. The Job object will start the lease when it
				// submits the job.
			if ( ( m_hasSharedLeases || curr_job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) &&
				 CalculateJobLease( curr_job->jobAd, new_expire,
									m_defaultLeaseDuration,
									&job_renew_time ) ) {

				if ( new_expire < min_new_expire ) {
					min_new_expire = new_expire;
				}
				if ( !m_hasSharedLeases ) {
					curr_job->UpdateJobLeaseSent( new_expire );
					leaseUpdates.Append( curr_job );
				}
			} else if ( job_renew_time < next_renew_time ) {
				next_renew_time = job_renew_time;
			}
dprintf(D_FULLDEBUG,"    after %d.%d: min_new_expire=%d next_renew_time=%ld job_renew_time=%ld\n",curr_job->procID.cluster,curr_job->procID.proc,min_new_expire,next_renew_time,job_renew_time);
		}
		if ( min_new_expire == INT_MAX ||
			 ( m_hasSharedLeases && next_renew_time < INT_MAX &&
			   m_sharedLeaseExpiration != 0 ) ) {
			if ( next_renew_time > time(NULL) + 3600 ) {
				next_renew_time = time(NULL) + 3600;
			}
dprintf(D_FULLDEBUG,"    UpdateLeases: nothing to renew, resetting timer for %ld secs\n",next_renew_time - time(NULL));
			lastUpdateLeases = time(NULL);
			daemonCore->Reset_Timer( updateLeasesTimerId,
									 next_renew_time - time(NULL) );
		} else {
			if ( m_hasSharedLeases ) {
				registeredJobs.Rewind();
				while ( registeredJobs.Next( curr_job ) ) {
					std::string job_id;
					if ( curr_job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) {
						curr_job->UpdateJobLeaseSent( min_new_expire );
					}
				}
				m_sharedLeaseExpiration = min_new_expire;
dprintf(D_FULLDEBUG,"    new shared lease expiration at %ld, updating job ads...\n",m_sharedLeaseExpiration);
			}
			requestScheddUpdateNotification( updateLeasesTimerId );
			updateLeasesActive = true;
			leaseAttrsSynched = false;
		}
		return;
	}

	if ( leaseAttrsSynched == false ) {
		bool still_dirty = false;
		BaseJob *curr_job;
		leaseUpdates.Rewind();
		while ( leaseUpdates.Next( curr_job ) ) {
			bool exists, dirty;
			curr_job->jobAd->GetDirtyFlag( ATTR_JOB_LEASE_EXPIRATION,
										   &exists, &dirty );
			if ( !exists ) {
					// What!? The attribute disappeared? Forget about renewing
					// the lease then
				dprintf( D_ALWAYS, "Lease attribute disappeared for job %d.%d, ignoring it\n",
						 curr_job->procID.cluster, curr_job->procID.proc );
				leaseUpdates.DeleteCurrent();
			}
			if ( dirty ) {
				still_dirty = true;
				requestScheddUpdate( curr_job, false );
			}
		}
		if ( still_dirty ) {
			requestScheddUpdateNotification( updateLeasesTimerId );
dprintf(D_FULLDEBUG,"    UpdateLeases: waiting for schedd synch\n");
			return;
		}
else dprintf(D_FULLDEBUG,"    UpdateLeases: leases synched\n");
	}

	leaseAttrsSynched = true;

	unsigned update_delay = 0;
	bool update_complete;
	SimpleList<PROC_ID> update_succeeded;
	bool update_success;
dprintf(D_FULLDEBUG,"    UpdateLeases: calling DoUpdateLeases\n");
	if ( m_hasSharedLeases ) {
		DoUpdateSharedLease( update_delay, update_complete, update_success );
	} else {
		DoUpdateLeases( update_delay, update_complete, update_succeeded );
	}

	if ( update_delay ) {
		daemonCore->Reset_Timer( updateLeasesTimerId, update_delay );
dprintf(D_FULLDEBUG,"    UpdateLeases: DoUpdateLeases wants delay of %uld secs\n",update_delay);
		return;
	}

	if ( !update_complete ) {
		updateLeasesCmdActive = true;
dprintf(D_FULLDEBUG,"    UpdateLeases: DoUpdateLeases in progress\n");
		return;
	}

dprintf(D_FULLDEBUG,"    UpdateLeases: DoUpdateLeases complete, processing results\n");
	bool first_update = lastUpdateLeases == 0;

	updateLeasesCmdActive = false;
	lastUpdateLeases = time(NULL);

	if ( m_hasSharedLeases ) {
		BaseJob *curr_job;
		std::string tmp;
		registeredJobs.Rewind();
		while ( registeredJobs.Next( curr_job ) ) {
			if ( first_update ) {
				// New jobs may be waiting for the lease be to established
				// before they proceed with submission.
				curr_job->SetEvaluateState();
			}
			if ( !curr_job->jobAd->LookupString( ATTR_GRID_JOB_ID, tmp ) ) {
				continue;
			}
			bool curr_renewal_failed = !update_success;
			bool last_renewal_failed = false;
			curr_job->jobAd->LookupBool( ATTR_LAST_JOB_LEASE_RENEWAL_FAILED,
										 last_renewal_failed );
			if ( curr_renewal_failed != last_renewal_failed ) {
				curr_job->jobAd->Assign( ATTR_LAST_JOB_LEASE_RENEWAL_FAILED,
										 curr_renewal_failed );
				requestScheddUpdate( curr_job, false );
			}
		}
	} else {
update_succeeded.Rewind();
PROC_ID id;
std::string msg = "    update_succeeded:";
 while(update_succeeded.Next(id)) formatstr_cat(msg, " %d.%d", id.cluster, id.proc);
dprintf(D_FULLDEBUG,"%s\n",msg.c_str());
		BaseJob *curr_job;
		leaseUpdates.Rewind();
		while ( leaseUpdates.Next( curr_job ) ) {
			bool curr_renewal_failed;
			bool last_renewal_failed = false;
			if ( update_succeeded.IsMember( curr_job->procID ) ) {
dprintf(D_FULLDEBUG,"    %d.%d is in succeeded list\n",curr_job->procID.cluster,curr_job->procID.proc);
				curr_renewal_failed = false;
			} else {
dprintf(D_FULLDEBUG,"    %d.%d is not in succeeded list\n",curr_job->procID.cluster,curr_job->procID.proc);
				curr_renewal_failed = true;
			}
			curr_job->jobAd->LookupBool( ATTR_LAST_JOB_LEASE_RENEWAL_FAILED,
										 last_renewal_failed );
			if ( curr_renewal_failed != last_renewal_failed ) {
				curr_job->jobAd->Assign( ATTR_LAST_JOB_LEASE_RENEWAL_FAILED,
										 curr_renewal_failed );
				requestScheddUpdate( curr_job, false );
			}
			leaseUpdates.DeleteCurrent();
		}
	}

	updateLeasesActive = false;

dprintf(D_FULLDEBUG,"    UpdateLeases: lease update complete, resetting timer for 30 secs\n");
	daemonCore->Reset_Timer( updateLeasesTimerId, UPDATE_LEASE_DELAY );
}
Esempio n. 15
0
// Run the gridmanager state machine for this Unicore job.
//
// On entry the evaluate-state timer is reset to TIMER_NEVER and, if a gahp
// client exists, it is put in normal mode. The switch on gmState is then
// executed repeatedly for as long as a pass changes gmState or unicoreState,
// or a state explicitly sets reevaluate_state. A state that is waiting on a
// gahp command (return code GAHPCLIENT_COMMAND_NOT_SUBMITTED or
// GAHPCLIENT_COMMAND_PENDING) breaks out without changing gmState.
// Whenever gmState changes, pending gahp requests are purged and submitAd
// is freed.
void UnicoreJob::doEvaluateState()
{
	int old_gm_state;
	int old_unicore_state;
	bool reevaluate_state = true;
	// Sampled once; used by the timing checks in the states below.
	time_t now = time(NULL);

	bool attr_exists;
	bool attr_dirty;
	int rc;

	daemonCore->Reset_Timer( evaluateStateTid, TIMER_NEVER );

    dprintf(D_ALWAYS,
			"(%d.%d) doEvaluateState called: gmState %s, unicoreState %d\n",
			procID.cluster,procID.proc,GMStateNames[gmState],unicoreState);

	if ( gahp ) {
		gahp->setMode( GahpClient::normal );
	}

	do {
		reevaluate_state = false;
		old_gm_state = gmState;
		old_unicore_state = unicoreState;
		// Only GM_HOLD and GM_DELETE may be evaluated without a gahp client.
		ASSERT ( gahp != NULL || gmState == GM_HOLD || gmState == GM_DELETE );

		switch ( gmState ) {
		case GM_INIT: {
			// This is the state all jobs start in when the GlobusJob object
			// is first created. Here, we do things that we didn't want to
			// do in the constructor because they could block (the
			// constructor is called while we're connected to the schedd).
			if ( gahp->Startup() == false ) {
				dprintf( D_ALWAYS, "(%d.%d) Error starting up GAHP\n",
						 procID.cluster, procID.proc );
				
				jobAd->Assign( ATTR_HOLD_REASON, "Failed to start GAHP" );
				gmState = GM_HOLD;
				break;
			}

			// Register the callback in blocking mode, then restore the
			// previous mode.
			GahpClient::mode saved_mode = gahp->getMode();
			gahp->setMode( GahpClient::blocking );

			rc = gahp->unicore_job_callback( UnicoreGahpCallbackHandler );
			if ( rc != GLOBUS_SUCCESS ) {
				dprintf( D_ALWAYS,
						 "(%d.%d) Error enabling unicore callback, err=%d\n", 
						 procID.cluster, procID.proc, rc );
				jobAd->Assign( ATTR_HOLD_REASON, "Failed to initialize GAHP" );
				gmState = GM_HOLD;
				// NOTE(review): this break skips the setMode( saved_mode )
				// below, so the gahp client stays in blocking mode on this
				// failure path -- confirm whether that is intended.
				break;
			}

			gahp->setMode( saved_mode );

			gmState = GM_START;
			} break;
		case GM_START: {
			// This state is the real start of the state machine, after
			// one-time initialization has been taken care of.
			// If we think there's a running jobmanager
			// out there, we try to register for callbacks (in GM_REGISTER).
			// The one way jobs can end up back in this state is if we
			// attempt a restart of a jobmanager only to be told that the
			// old jobmanager process is still alive.
			errorString = "";
			if ( jobContact == NULL ) {
				gmState = GM_CLEAR_REQUEST;
			} else if ( wantResubmit || doResubmit ) {
				gmState = GM_CLEAR_REQUEST;
			} else {
				if ( condorState == RUNNING ) {
					executeLogged = true;
				}

				gmState = GM_RECOVER;
			}
			} break;
		case GM_RECOVER: {
			// We're recovering from a crash after the job was submitted.
			// Allow the gahp server to recover its internal state about
			// the job.
			if ( submitAd == NULL ) {
				submitAd = buildSubmitAd();
			}
			if ( submitAd == NULL ) {
				gmState = GM_HOLD;
				break;
			}
			rc = gahp->unicore_job_recover( submitAd->c_str() );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			}
			if ( rc != GLOBUS_SUCCESS ) {
				// unhandled error
				dprintf(D_ALWAYS,"(%d.%d) unicore_job_recover() failed\n",
						procID.cluster, procID.proc);
				gmState = GM_CANCEL;
				break;
			}
			gmState = GM_SUBMITTED;
		} break;
		case GM_UNSUBMITTED: {
			// There are no outstanding gram submissions for this job (if
			// there is one, we've given up on it).
			if ( condorState == REMOVED ) {
				gmState = GM_DELETE;
			} else if ( condorState == HELD ) {
				gmState = GM_DELETE;
				break;
			} else {
				gmState = GM_SUBMIT;
			}
			} break;
		case GM_SUBMIT: {
			// Start a new gram submission for this job.
			char *job_contact = NULL;
			if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_UNSUBMITTED;
				break;
			}
			if ( numSubmitAttempts >= MAX_SUBMIT_ATTEMPTS ) {
				jobAd->Assign( ATTR_HOLD_REASON,
							   "Attempts to submit failed" );
				gmState = GM_HOLD;
				break;
			}
			// After a submit, wait at least submitInterval before trying
			// another one.
			if ( now >= lastSubmitAttempt + submitInterval ) {
				if ( submitAd == NULL ) {
					submitAd = buildSubmitAd();
				}
				if ( submitAd == NULL ) {
					gmState = GM_HOLD;
					break;
				}
				rc = gahp->unicore_job_create( submitAd->c_str(),
											   &job_contact );
				if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
					 rc == GAHPCLIENT_COMMAND_PENDING ) {
					break;
				}
				// The command completed (successfully or not): count it as
				// a submit attempt.
				lastSubmitAttempt = time(NULL);
				numSubmitAttempts++;
				if ( rc == GLOBUS_SUCCESS ) {
						// job_contact was strdup()ed for us. Now we take
						// responsibility for free()ing it.
					SetRemoteJobId( job_contact );
					free( job_contact );
					WriteGridSubmitEventToUserLog( jobAd );
					gmState = GM_SUBMIT_SAVE;
				} else {
					// unhandled error
					dprintf(D_ALWAYS,"(%d.%d) unicore_job_create() failed\n",
							procID.cluster, procID.proc);
					dprintf(D_ALWAYS,"(%d.%d)    submitAd='%s'\n",
							procID.cluster, procID.proc,submitAd->c_str());
					if ( job_contact ) {
						free( job_contact );
					}
					gmState = GM_UNSUBMITTED;
					reevaluate_state = true;
				}
			} else if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_UNSUBMITTED;
			} else {
				// Too soon to retry: re-arm the timer for the remainder of
				// the submit interval.
				unsigned int delay = 0;
				if ( (lastSubmitAttempt + submitInterval) > now ) {
					delay = (lastSubmitAttempt + submitInterval) - now;
				}				
				daemonCore->Reset_Timer( evaluateStateTid, delay );
			}
			} break;
		case GM_SUBMIT_SAVE: {
			// Save the jobmanager's contact for a new gram submission.
			if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				// Stay in this state until the job id attribute is no longer
				// dirty in the job ad.
				jobAd->GetDirtyFlag( ATTR_GRID_JOB_ID, &attr_exists, &attr_dirty );
				if ( attr_exists && attr_dirty ) {
					requestScheddUpdate( this, true );
					break;
				}
				gmState = GM_SUBMIT_COMMIT;
			}
			} break;
		case GM_SUBMIT_COMMIT: {
			// Now that we've saved the jobmanager's contact, commit the
			// gram job submission.
			if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				rc = gahp->unicore_job_start( jobContact );
				if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
					 rc == GAHPCLIENT_COMMAND_PENDING ) {
					break;
				}
				if ( rc != GLOBUS_SUCCESS ) {
					// unhandled error
					dprintf(D_ALWAYS,"(%d.%d) unicore_job_start() failed\n",
							procID.cluster, procID.proc);
					gmState = GM_CANCEL;
				} else {
					gmState = GM_SUBMITTED;
				}
			}
			} break;
		case GM_SUBMITTED: {
			// The job has been submitted (or is about to be by the
			// jobmanager). Wait for completion or failure, and probe the
			// jobmanager occasionally to make sure it's still alive.
			if ( unicoreState == COMPLETED ) {
				gmState = GM_DONE_SAVE;
//			} else if ( unicoreState == GLOBUS_GRAM_PROTOCOL_JOB_STATE_FAILED ) {
//				gmState = GM_CANCEL;
			} else if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else if ( newRemoteStatusAd ) {
dprintf(D_FULLDEBUG,"(%d.%d) *** Processing callback ad\n",procID.cluster, procID.proc );
				lastProbeTime = now;
				UpdateUnicoreState( newRemoteStatusAd );
				delete newRemoteStatusAd;
				newRemoteStatusAd = NULL;
				reevaluate_state = true;
/* Now that the gahp tells us when a job status changes, we don't need to
 * do active probes.
			} else {
				if ( lastProbeTime < enteredCurrentGmState ) {
					lastProbeTime = enteredCurrentGmState;
				}
				if ( probeNow ) {
					lastProbeTime = 0;
					probeNow = false;
				}
				if ( now >= lastProbeTime + probeInterval ) {
					gmState = GM_PROBE_JOBMANAGER;
					break;
				}
				unsigned int delay = 0;
				if ( (lastProbeTime + probeInterval) > now ) {
					delay = (lastProbeTime + probeInterval) - now;
				}				
				daemonCore->Reset_Timer( evaluateStateTid, delay );
*/
			}
			} break;
		case GM_PROBE_JOBMANAGER: {
			// Query the job's status through the gahp, then go back to
			// GM_SUBMITTED.
			if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				char *status_ad = NULL;
				rc = gahp->unicore_job_status( jobContact,
											   &status_ad );
				if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
					 rc == GAHPCLIENT_COMMAND_PENDING ) {
					break;
				}
				if ( rc != GLOBUS_SUCCESS ) {
					// unhandled error
					dprintf(D_ALWAYS,"(%d.%d) unicore_job_status() failed\n",
							procID.cluster, procID.proc);
					if ( status_ad ) {
						free( status_ad );
					}
					gmState = GM_CANCEL;
					break;
				}
				UpdateUnicoreState( status_ad );
				if ( status_ad ) {
					free( status_ad );
				}
				// Discard any queued callback ad now that the probe result
				// has been applied.
				if ( newRemoteStatusAd ) {
					delete newRemoteStatusAd;
					newRemoteStatusAd = NULL;
				}
				lastProbeTime = now;
				gmState = GM_SUBMITTED;
			}
			} break;
		case GM_DONE_SAVE: {
			// Report job completion to the schedd.
			JobTerminated();
			if ( condorState == COMPLETED ) {
				// Stay in this state until the job status attribute is no
				// longer dirty in the job ad.
				jobAd->GetDirtyFlag( ATTR_JOB_STATUS, &attr_exists, &attr_dirty );
				if ( attr_exists && attr_dirty ) {
					requestScheddUpdate( this, true );
					break;
				}
			}
			gmState = GM_DONE_COMMIT;
			} break;
		case GM_DONE_COMMIT: {
			// Tell the jobmanager it can clean up and exit.
			rc = gahp->unicore_job_destroy( jobContact );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			}
			if ( rc != GLOBUS_SUCCESS ) {
				// unhandled error
				dprintf(D_ALWAYS,"(%d.%d) unicore_job_destroy() failed\n",
						procID.cluster, procID.proc);
				gmState = GM_CANCEL;
				break;
			}
			if ( condorState == COMPLETED || condorState == REMOVED ) {
				gmState = GM_DELETE;
			} else {
				// Clear the contact string here because it may not get
				// cleared in GM_CLEAR_REQUEST (it might go to GM_HOLD first).
//				SetRemoteJobId( NULL );
//				gmState = GM_CLEAR_REQUEST;
				gmState = GM_HOLD;
			}
			} break;
		case GM_CANCEL: {
			// We need to cancel the job submission.
//			if ( unicoreState != COMPLETED &&
//				 unicoreState != GLOBUS_GRAM_PROTOCOL_JOB_STATE_FAILED ) {
if ( unicoreState != COMPLETED ) {
				rc = gahp->unicore_job_destroy( jobContact );
				if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
					 rc == GAHPCLIENT_COMMAND_PENDING ) {
					break;
				}
				if ( rc != GLOBUS_SUCCESS ) {
					// unhandled error
					dprintf(D_ALWAYS,"(%d.%d) unicore_job_destroy() failed\n",
							procID.cluster, procID.proc);
					gmState = GM_HOLD;
					break;
				}
			}
			if ( condorState == REMOVED ) {
				gmState = GM_DELETE;
			} else {
				gmState = GM_CLEAR_REQUEST;
			}
			} break;
		case GM_DELETE: {
			// We are done with the job. Propagate any remaining updates
			// to the schedd, then delete this object.
			DoneWithJob();
			// This object will be deleted when the update occurs
			} break;
		case GM_CLEAR_REQUEST: {
			// Remove all knowledge of any previous or present job
			// submission, in both the gridmanager and the schedd.

			errorString = "";
			SetRemoteJobId( NULL );
			SetRemoteJobStatus( NULL );
			JobIdle();

			// If there are no updates to be done when we first enter this
			// state, requestScheddUpdate will return done immediately
			// and not waste time with a needless connection to the
			// schedd. If updates need to be made, they won't show up in
			// schedd_actions after the first pass through this state
			// because we modified our local variables the first time
			// through. However, since we registered update events the
			// first time, requestScheddUpdate won't return done until
			// they've been committed to the schedd.
			const char *name;
			ExprTree *expr;
			jobAd->ResetExpr();
			if ( jobAd->NextDirtyExpr(name, expr) ) {
				requestScheddUpdate( this, true );
				break;
			}
			// Nothing is dirty any more: reset the per-submission logging
			// flags and start over.
			executeLogged = false;
			terminateLogged = false;
			abortLogged = false;
			evictLogged = false;
			gmState = GM_UNSUBMITTED;
			} break;
		case GM_HOLD: {
			// Put the job on hold in the schedd.
			// TODO: what happens if we learn here that the job is removed?
//			if ( jobContact &&
//				 unicoreState != GLOBUS_GRAM_PROTOCOL_JOB_STATE_UNKNOWN ) {
//				unicoreState = GLOBUS_GRAM_PROTOCOL_JOB_STATE_UNKNOWN;
//			}
			// If the condor state is already HELD, then someone already
			// HELD it, so don't update anything else.
			if ( condorState != HELD ) {

				// Set the hold reason as best we can
				// TODO: set the hold reason in a more robust way.
				// Preference order: reason already in the job ad, then
				// errorString, then a generic message. The last byte is
				// pre-set to '\0' so the strncpy calls below (limited to
				// size-1) always leave a terminated string.
				char holdReason[1024];
				holdReason[0] = '\0';
				holdReason[sizeof(holdReason)-1] = '\0';
				jobAd->LookupString( ATTR_HOLD_REASON, holdReason,
									 sizeof(holdReason) );
				if ( holdReason[0] == '\0' && errorString != "" ) {
					strncpy( holdReason, errorString.c_str(),
							 sizeof(holdReason) - 1 );
				}
				if ( holdReason[0] == '\0' ) {
					strncpy( holdReason, "Unspecified gridmanager error",
							 sizeof(holdReason) - 1 );
				}

				JobHeld( holdReason );
			}
			gmState = GM_DELETE;
			} break;
		default:
			EXCEPT( "(%d.%d) Unknown gmState %d!", procID.cluster,procID.proc,
					gmState );
		}

		// Any change of either state means another pass through the switch.
		if ( gmState != old_gm_state || unicoreState != old_unicore_state ) {
			reevaluate_state = true;
		}
		if ( unicoreState != old_unicore_state ) {
//			dprintf(D_FULLDEBUG, "(%d.%d) globus state change: %s -> %s\n",
//					procID.cluster, procID.proc,
//					GlobusJobStatusName(old_globus_state),
//					GlobusJobStatusName(globusState));
			enteredCurrentUnicoreState = time(NULL);
		}
		if ( gmState != old_gm_state ) {
			dprintf(D_FULLDEBUG, "(%d.%d) gm state change: %s -> %s\n",
					procID.cluster, procID.proc, GMStateNames[old_gm_state],
					GMStateNames[gmState]);
			enteredCurrentGmState = time(NULL);
			// If we were waiting for a pending unicore call, we're not
			// anymore so purge it.
			if ( gahp ) {
				gahp->purgePendingRequests();
			}
			// If we were calling unicore_job_create and using submitAd,
			// we're done with it now, so free it.
			if ( submitAd ) {
				delete submitAd;
				submitAd = NULL;
			}
		}

	} while ( reevaluate_state );
}
Esempio n. 16
0
// Run this job's gridmanager state machine.
//
// The function first resets the evaluateStateTid timer to TIMER_NEVER and
// picks the GAHP client mode from the resource flags. It then dispatches on
// gmState in a loop: each case may perform one step of work (GAHP command,
// job ad update, user log event) and assign a new gmState. Whenever a pass
// changes gmState the loop runs again immediately; when a pass leaves
// gmState unchanged the function returns.
//
// A case that `break`s without changing gmState is therefore "waiting":
// nothing more happens until this function is invoked again.
// NOTE(review): what re-invokes it (GAHP result arrival, schedd update
// completion, the evaluateStateTid timer) is not visible here, except for
// the explicit Reset_Timer() call in GM_SUBMIT -- confirm against callers.
//
// On every state change the cached RSL, stageList and stageLocalList are
// deleted and enteredCurrentGmState is refreshed.
void NordugridJob::doEvaluateState()
{
	int old_gm_state;
	bool reevaluate_state = true;
	// Sampled once; every pass of the loop below compares against this
	// same timestamp.
	time_t now = time(NULL);

	bool attr_exists;
	bool attr_dirty;
	int rc;

	daemonCore->Reset_Timer( evaluateStateTid, TIMER_NEVER );

    dprintf(D_ALWAYS,
			"(%d.%d) doEvaluateState called: gmState %s, condorState %d\n",
			procID.cluster,procID.proc,GMStateNames[gmState],condorState);

	// While the resource's state is unknown, a ping is pending, or the
	// resource is down, put the GAHP client in results_only mode;
	// otherwise use normal mode.
	// NOTE(review): presumably results_only stops new commands from being
	// issued while still returning completed results -- confirm in
	// GahpClient.
	if ( gahp ) {
		if ( !resourceStateKnown || resourcePingPending || resourceDown ) {
			gahp->setMode( GahpClient::results_only );
		} else {
			gahp->setMode( GahpClient::normal );
		}
	}

	do {
		reevaluate_state = false;
		old_gm_state = gmState;

		switch ( gmState ) {
		case GM_INIT: {
			// This is the state all jobs start in when the job object
			// is first created. Here, we do things that we didn't want to
			// do in the constructor because they could block (the
			// constructor is called while we're connected to the schedd).
			// Any GAHP startup/initialization failure records a hold
			// reason in the job ad and sends the job to GM_HOLD.
			if ( gahp->Startup() == false ) {
				dprintf( D_ALWAYS, "(%d.%d) Error starting GAHP\n",
						 procID.cluster, procID.proc );

				jobAd->Assign( ATTR_HOLD_REASON, "Failed to start GAHP" );
				gmState = GM_HOLD;
				break;
			}
			if ( gahp->Initialize( jobProxy ) == false ) {
				dprintf( D_ALWAYS, "(%d.%d) Error initializing GAHP\n",
						 procID.cluster, procID.proc );

				jobAd->Assign( ATTR_HOLD_REASON,
							   "Failed to initialize GAHP" );
				gmState = GM_HOLD;
				break;
			}

			gahp->setDelegProxy( jobProxy );

			gmState = GM_START;
			} break;
		case GM_START: {
			// Decide where to resume based on what we already know about
			// the job. With no remote job id there is nothing to recover,
			// so go through GM_CLEAR_REQUEST. Otherwise the job was
			// submitted earlier: mark the matching user log events as
			// already written and pick a recovery path from the last
			// recorded remote state.
			errorString = "";
			if ( remoteJobId == NULL ) {
				gmState = GM_CLEAR_REQUEST;
			} else {
				submitLogged = true;
				if ( condorState == RUNNING ||
					 condorState == COMPLETED ) {
					executeLogged = true;
				}

				// If the remote state is unknown or still one of the
				// early states (accepting/accepted/preparing), query for
				// fresh status first; otherwise treat the job as
				// submitted.
				if ( remoteJobState == "" ||
					 remoteJobState == REMOTE_STATE_ACCEPTING ||
					 remoteJobState == REMOTE_STATE_ACCEPTED ||
					 remoteJobState == REMOTE_STATE_PREPARING ) {
					gmState = GM_RECOVER_QUERY;
				} else {
					gmState = GM_SUBMITTED;
				}
			}
			} break;
		case GM_RECOVER_QUERY: {
			// Wait for a remote status update that is newer than our
			// entry into this state, then resume at stage-in (early
			// remote states) or at GM_SUBMITTED. If the status is flagged
			// as unknown instead, give up and cancel. If neither
			// condition holds, stay in this state.
			if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				if ( m_lastRemoteStatusUpdate > enteredCurrentGmState ) {
					if ( remoteJobState == REMOTE_STATE_ACCEPTING ||
						 remoteJobState == REMOTE_STATE_ACCEPTED ||
						 remoteJobState == REMOTE_STATE_PREPARING ) {
						gmState = GM_STAGE_IN;
					} else {
						gmState = GM_SUBMITTED;
					}
				} else if ( m_currentStatusUnknown ) {
					gmState = GM_CANCEL;
				}
			}
			} break;
		case GM_UNSUBMITTED: {
			// No remote job exists. Removed and held jobs both go to
			// GM_DELETE; anything else proceeds to submission.
			if ( condorState == REMOVED ) {
				gmState = GM_DELETE;
			} else if ( condorState == HELD ) {
				gmState = GM_DELETE;
				break;
			} else {
				gmState = GM_SUBMIT;
			}
			} break;
		case GM_SUBMIT: {
			// Submit the job to the remote resource.
			if ( condorState == REMOVED || condorState == HELD ) {
				myResource->CancelSubmit( this );
				gmState = GM_UNSUBMITTED;
				break;
			}
			// Too many attempts: go on hold. The hold reason assignment
			// below is disabled, so GM_HOLD uses whatever reason is
			// already in the job ad, else errorString, else its generic
			// message.
			if ( numSubmitAttempts >= MAX_SUBMIT_ATTEMPTS ) {
//				jobAd->Assign( ATTR_HOLD_REASON,
//							   "Attempts to submit failed" );
				gmState = GM_HOLD;
				break;
			}
			// After a submit, wait at least submitInterval before trying
			// another one.
			if ( now >= lastSubmitAttempt + submitInterval ) {

				// Filled in by nordugrid_submit() on success; freed below
				// with free().
				char *job_id = NULL;

				// Once RequestSubmit() is called at least once, you must
				// CancelRequest() once you're done with the request call
				if ( myResource->RequestSubmit( this ) == false ) {
					// Not allowed to submit yet; stay in this state.
					break;
				}

				// Build the RSL once and keep it while we remain in this
				// state; it is deleted after the switch when gmState
				// changes.
				if ( RSL == NULL ) {
					RSL = buildSubmitRSL();
				}
				if ( RSL == NULL ) {
					gmState = GM_HOLD;
					break;
				}
				rc = gahp->nordugrid_submit( 
										resourceManagerString,
										RSL->c_str(),
										job_id );
				// Command not issued yet or still in flight: stay in this
				// state and check again on a later call.
				if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
					 rc == GAHPCLIENT_COMMAND_PENDING ) {
					break;
				}

				// The command completed (success or failure); count it as
				// an attempt.
				lastSubmitAttempt = time(NULL);
				numSubmitAttempts++;

				if ( rc == 0 ) {
					ASSERT( job_id != NULL );
					SetRemoteJobId( job_id );
					free( job_id );
					WriteGridSubmitEventToUserLog( jobAd );
					gmState = GM_SUBMIT_SAVE;
				} else {
					// Failed submit: remember the error and go back to
					// GM_UNSUBMITTED, which routes a non-removed,
					// non-held job back here for another attempt.
					errorString = gahp->getErrorString();
					dprintf(D_ALWAYS,"(%d.%d) job submit failed: %s\n",
							procID.cluster, procID.proc,
							errorString.c_str() );
					myResource->CancelSubmit( this );
					gmState = GM_UNSUBMITTED;
				}

			} else {
				// Too soon since the last attempt: arm the evaluate timer
				// for the remaining part of submitInterval. The inner
				// check mirrors the enclosing else condition and keeps
				// the unsigned delay from being computed from a negative
				// difference.
				unsigned int delay = 0;
				if ( (lastSubmitAttempt + submitInterval) > now ) {
					delay = (lastSubmitAttempt + submitInterval) - now;
				}				
				daemonCore->Reset_Timer( evaluateStateTid, delay );
			}
			} break;
		case GM_SUBMIT_SAVE: {
			// Stay here while the remote job id attribute in the job ad
			// is still marked dirty, requesting a schedd update each
			// time; move on to stage-in once it is clean.
			// NOTE(review): presumably "dirty" means not yet committed to
			// the schedd -- confirm.
			if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				jobAd->GetDirtyFlag( ATTR_GRID_JOB_ID, &attr_exists, &attr_dirty );
				if ( attr_exists && attr_dirty ) {
					requestScheddUpdate( this, true );
					break;
				}
				gmState = GM_STAGE_IN;
			}
			} break;
		case GM_STAGE_IN: {
			// Transfer input files to the remote job. The list is built
			// on first entry and reused while the GAHP command is
			// pending. Entries that are URLs are removed from the list.
			// NOTE(review): presumably URL inputs are fetched by the
			// remote side rather than pushed from here -- confirm.
			if ( stageList == NULL ) {
				const char *file;
				stageList = buildStageInList();
				stageList->rewind();
				while ( (file = stageList->next()) ) {
					if ( IsUrl( file ) ) {
						stageList->deleteCurrent();
					}
				}
			}
			rc = gahp->nordugrid_stage_in( resourceManagerString, remoteJobId,
										   *stageList );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			} else if ( rc != 0 ) {
				errorString = gahp->getErrorString();
				dprintf( D_ALWAYS, "(%d.%d) file stage in failed: %s\n",
						 procID.cluster, procID.proc, errorString.c_str() );
				gmState = GM_CANCEL;
			} else {
				gmState = GM_SUBMITTED;
			}
			} break;
		case GM_SUBMITTED: {
			// The job is at the remote resource. A finished, failed,
			// killed or deleted remote state takes priority (even over a
			// local remove/hold) and leads to GM_EXIT_INFO; otherwise a
			// local remove/hold cancels the job.
			if ( remoteJobState == REMOTE_STATE_FINISHED ||
				 remoteJobState == REMOTE_STATE_FAILED ||
				 remoteJobState == REMOTE_STATE_KILLED ||
				 remoteJobState == REMOTE_STATE_DELETED ) {
					gmState = GM_EXIT_INFO;
			} else if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				if ( lastProbeTime < enteredCurrentGmState ) {
					lastProbeTime = enteredCurrentGmState;
				}
				if ( probeNow ) {
					lastProbeTime = 0;
					probeNow = false;
				}
				// Per-job polling is disabled (block below). With it
				// commented out, this branch never changes gmState, so
				// remoteJobState has to be updated from outside this
				// function for the job to make progress.
				// NOTE(review): confirm who updates remoteJobState.
/*
				int probe_interval = myResource->GetJobPollInterval();
				if ( now >= lastProbeTime + probe_interval ) {
					gmState = GM_PROBE_JOB;
					break;
				}
				unsigned int delay = 0;
				if ( (lastProbeTime + probe_interval) > now ) {
					delay = (lastProbeTime + probe_interval) - now;
				}				
				daemonCore->Reset_Timer( evaluateStateTid, delay );
*/
			}
			} break;
		case GM_PROBE_JOB: {
			// Query the remote status of a single job. No active code in
			// this function transitions into this state (the only
			// assignment is inside the disabled block in GM_SUBMITTED).
			if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				// Set by nordugrid_status(); freed below with free().
				char *new_status = NULL;
				rc = gahp->nordugrid_status( resourceManagerString,
											 remoteJobId, new_status );
				if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
					 rc == GAHPCLIENT_COMMAND_PENDING ) {
					break;
				} else if ( rc != 0 ) {
					// What to do about failure?
					// For now the error is only logged; the previous
					// remoteJobState is kept.
					errorString = gahp->getErrorString();
					dprintf( D_ALWAYS, "(%d.%d) job probe failed: %s\n",
							 procID.cluster, procID.proc,
							 errorString.c_str() );
				} else {
					if ( new_status ) {
						remoteJobState = new_status;
					} else {
						remoteJobState = "";
					}
					SetRemoteJobStatus( new_status );
				}
				if ( new_status ) {
					free( new_status );
				}
				// Success or failure, record the probe time and return to
				// GM_SUBMITTED.
				lastProbeTime = now;
				gmState = GM_SUBMITTED;
			}
			} break;
		case GM_EXIT_INFO: {
			// Fetch the job's exit code, CPU time and wall time through
			// an LDAP query, filtering on the job's global id.
			std::string filter;
			StringList reply;
			formatstr( filter, "nordugrid-job-globalid=gsiftp://%s:2811/jobs/%s",
							resourceManagerString, remoteJobId );

			rc = gahp->nordugrid_ldap_query( resourceManagerString, "mds-vo-name=local,o=grid", filter.c_str(), "nordugrid-job-usedcputime,nordugrid-job-usedwalltime,nordugrid-job-exitcode", reply );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			} else if ( rc != 0 ) {
				errorString = gahp->getErrorString();
				dprintf( D_ALWAYS, "(%d.%d) exit info gathering failed: %s\n",
						 procID.cluster, procID.proc, errorString.c_str() );
				gmState = GM_CANCEL;
			} else {
				// -1 marks "not seen in the reply".
				int exit_code = -1;
				int wallclock = -1;
				int cpu = -1;
				const char *entry;
				reply.rewind();
				// Each reply line of interest has the form
				// "<attribute>: <value>". The strncmp length covers the
				// attribute name plus ": ", so the strchr() for ' ' is
				// guaranteed to find the separator and the value starts
				// one character after it.
				while ( (entry = reply.next()) ) {
					if ( !strncmp( entry, "nordugrid-job-usedcputime: ", 27 ) ) {
						entry = strchr( entry, ' ' ) + 1;
						cpu = atoi( entry );
					} else if ( !strncmp( entry, "nordugrid-job-usedwalltime: ", 28 ) ) {
						entry = strchr( entry, ' ' ) + 1;
						wallclock = atoi( entry );
					} else if ( !strncmp( entry, "nordugrid-job-exitcode: ", 24 ) ) {
						entry = strchr( entry, ' ' ) + 1;
						exit_code = atoi( entry );
					}
				}
				// All three values are required; if any is missing (or
				// parsed as negative) the job is cancelled.
				if ( exit_code < 0 || wallclock < 0 || cpu < 0 ) {
					dprintf( D_ALWAYS, "(%d.%d) exit info missing\n",
							 procID.cluster, procID.proc );
					gmState = GM_CANCEL;
					break;
				}
				// Exit codes above 128 are recorded as death by signal
				// number (exit_code - 128); anything else as a normal
				// exit code.
				if ( exit_code > 128 ) {
					jobAd->Assign( ATTR_ON_EXIT_BY_SIGNAL, true );
					jobAd->Assign( ATTR_ON_EXIT_SIGNAL, exit_code - 128 );
				} else {
					jobAd->Assign( ATTR_ON_EXIT_BY_SIGNAL, false );
					jobAd->Assign( ATTR_ON_EXIT_CODE, exit_code );
				}
				// NOTE(review): the * 60 presumably converts minutes
				// reported by the server into seconds -- confirm units.
				jobAd->Assign( ATTR_JOB_REMOTE_WALL_CLOCK, wallclock * 60 );
				jobAd->Assign( ATTR_JOB_REMOTE_USER_CPU, cpu * 60 );
				gmState = GM_STAGE_OUT;
			}
			} break;
		case GM_STAGE_OUT: {
			// Retrieve output files. stageList holds the remote names and
			// stageLocalList the corresponding local destinations; both
			// are built on first entry and reused while the GAHP command
			// is pending.
			if ( stageList == NULL ) {
				stageList = buildStageOutList();
			}
			if ( stageLocalList == NULL ) {
				const char *file;
				stageLocalList = buildStageOutLocalList( stageList );
				// Walk both lists in lockstep (the ASSERT checks that
				// stageList has an entry for every local entry) and drop
				// each pair whose local destination is a URL.
				stageList->rewind();
				stageLocalList->rewind();
				while ( (file = stageLocalList->next()) ) {
					ASSERT( stageList->next() );
					if ( IsUrl( file ) ) {
						stageList->deleteCurrent();
						stageLocalList->deleteCurrent();
				}
				}
			}
			rc = gahp->nordugrid_stage_out2( resourceManagerString,
											 remoteJobId,
											 *stageList, *stageLocalList );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			} else if ( rc != 0 ) {
				errorString = gahp->getErrorString();
				dprintf( D_ALWAYS, "(%d.%d) file stage out failed: %s\n",
						 procID.cluster, procID.proc, errorString.c_str() );
				gmState = GM_CANCEL;
			} else {
				gmState = GM_DONE_SAVE;
			}
			} break;
		case GM_DONE_SAVE: {
			// Report termination locally, then stay here (requesting a
			// schedd update) while the job status attribute is still
			// dirty. Held or removed jobs skip straight to
			// GM_DONE_COMMIT.
			if ( condorState != HELD && condorState != REMOVED ) {
				JobTerminated();
				if ( condorState == COMPLETED ) {
					jobAd->GetDirtyFlag( ATTR_JOB_STATUS, &attr_exists, &attr_dirty );
					if ( attr_exists && attr_dirty ) {
						requestScheddUpdate( this, true );
						break;
					}
				}
			}
			gmState = GM_DONE_COMMIT;
			} break;
		case GM_DONE_COMMIT: {
			// Clean up the finished job on the remote side; this uses the
			// same nordugrid_cancel() call as GM_CANCEL. A failed cleanup
			// puts the job on hold.
			rc = gahp->nordugrid_cancel( resourceManagerString, remoteJobId );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			} else if ( rc != 0 ) {
				errorString = gahp->getErrorString();
				dprintf( D_ALWAYS, "(%d.%d) job cleanup failed: %s\n",
						 procID.cluster, procID.proc, errorString.c_str() );
				gmState = GM_HOLD;
				break;
			}
			myResource->CancelSubmit( this );
			if ( condorState == COMPLETED || condorState == REMOVED ) {
				gmState = GM_DELETE;
			} else {
				// Clear the contact string here because it may not get
				// cleared in GM_CLEAR_REQUEST (it might go to GM_HOLD first).
				if ( remoteJobId != NULL ) {
					SetRemoteJobId( NULL );
				}
				gmState = GM_CLEAR_REQUEST;
			}
			} break;
		case GM_CANCEL: {
			// Cancel the remote job. Once the command completes, the job
			// goes to GM_FAILED whether or not the cancel succeeded; a
			// failure is only logged and saved in errorString.
			rc = gahp->nordugrid_cancel( resourceManagerString, remoteJobId );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			} else if ( rc == 0 ) {
				gmState = GM_FAILED;
			} else {
				// What to do about a failed cancel?
				errorString = gahp->getErrorString();
				dprintf( D_ALWAYS, "(%d.%d) job cancel failed: %s\n",
						 procID.cluster, procID.proc, errorString.c_str() );
				gmState = GM_FAILED;
			}
			} break;
		case GM_FAILED: {
			// The remote submission is gone: release the submit slot and
			// forget the remote job id. Removed jobs are deleted; all
			// others go through GM_CLEAR_REQUEST.
			myResource->CancelSubmit( this );
			SetRemoteJobId( NULL );

			if ( condorState == REMOVED ) {
				gmState = GM_DELETE;
			} else {
				gmState = GM_CLEAR_REQUEST;
			}
			} break;
		case GM_DELETE: {
			// The job has completed or been removed. Delete it from the
			// schedd.
			// gmState is not changed here, so the evaluation loop ends
			// after this pass.
			DoneWithJob();
			// This object will be deleted when the update occurs
			} break;
		case GM_CLEAR_REQUEST: {
			// Remove all knowledge of any previous or present job
			// submission, in both the gridmanager and the schedd.

			// If we are doing a rematch, we are simply waiting around
			// for the schedd to be updated and subsequently this globus job
			// object to be destroyed.  So there is nothing to do.
			if ( wantRematch ) {
				break;
			}

			// For now, put problem jobs on hold instead of
			// forgetting about current submission and trying again.
			// TODO: Let our action here be dictated by the user preference
			// expressed in the job ad.
			if ( remoteJobId != NULL
				     && condorState != REMOVED 
					 && wantResubmit == 0 
					 && doResubmit == 0 ) {
				gmState = GM_HOLD;
				break;
			}
			// Only allow a rematch *if* we are also going to perform a resubmit
			if ( wantResubmit || doResubmit ) {
				jobAd->EvalBool(ATTR_REMATCH_CHECK,NULL,wantRematch);
			}
			// The resubmit flags are one-shot: log why we are
			// resubmitting and clear them.
			if ( wantResubmit ) {
				wantResubmit = 0;
				dprintf(D_ALWAYS,
						"(%d.%d) Resubmitting to Globus because %s==TRUE\n",
						procID.cluster, procID.proc, ATTR_GLOBUS_RESUBMIT_CHECK );
			}
			if ( doResubmit ) {
				doResubmit = 0;
				dprintf(D_ALWAYS,
					"(%d.%d) Resubmitting to Globus (last submit failed)\n",
						procID.cluster, procID.proc );
			}
			errorString = "";
			if ( remoteJobId != NULL ) {
				SetRemoteJobId( NULL );
			}
			JobIdle();
			// If a submit event was logged for the old submission, record
			// the eviction, writing the evict event to the user log at
			// most once.
			if ( submitLogged ) {
				JobEvicted();
				if ( !evictLogged ) {
					WriteEvictEventToUserLog( jobAd );
					evictLogged = true;
				}
			}
			myResource->CancelSubmit( this );
			
			if ( wantRematch ) {
				dprintf(D_ALWAYS,
						"(%d.%d) Requesting schedd to rematch job because %s==TRUE\n",
						procID.cluster, procID.proc, ATTR_REMATCH_CHECK );

				// Set ad attributes so the schedd finds a new match.
				// NOTE(review): presumably LookupBool() returns non-zero
				// when the attribute is found -- confirm.
				int dummy;
				if ( jobAd->LookupBool( ATTR_JOB_MATCHED, dummy ) != 0 ) {
					jobAd->Assign( ATTR_JOB_MATCHED, false );
					jobAd->Assign( ATTR_CURRENT_HOSTS, 0 );
				}

				// If we are rematching, we need to forget about this job
				// cuz we wanna pull a fresh new job ad, with a fresh new match,
				// from the all-singing schedd.
				gmState = GM_DELETE;
				break;
			}
			
			// If there are no updates to be done when we first enter this
			// state, requestScheddUpdate will return done immediately
			// and not waste time with a needless connection to the
			// schedd. If updates need to be made, they won't show up in
			// schedd_actions after the first pass through this state
			// because we modified our local variables the first time
			// through. However, since we registered update events the
			// first time, requestScheddUpdate won't return done until
			// they've been committed to the schedd.
			const char *name;
			ExprTree *expr;
			jobAd->ResetExpr();
			if ( jobAd->NextDirtyExpr(name, expr) ) {
				requestScheddUpdate( this, true );
				break;
			}
			// No dirty attributes are left: reset the cached remote state
			// and the user-log bookkeeping flags so the job starts over
			// as unsubmitted.
			if ( remoteJobState != "" ) {
				remoteJobState = "";
				SetRemoteJobStatus( NULL );
			}
			submitLogged = false;
			executeLogged = false;
			submitFailedLogged = false;
			terminateLogged = false;
			abortLogged = false;
			evictLogged = false;
			gmState = GM_UNSUBMITTED;
			} break;
		case GM_HOLD: {
			// Put the job on hold in the schedd.
			// TODO: what happens if we learn here that the job is removed?
			// If the condor state is already HELD, then someone already
			// HELD it, so don't update anything else.
			if ( condorState != HELD ) {

				// Set the hold reason as best we can
				// TODO: set the hold reason in a more robust way.
				// Preference order: reason already in the job ad, then
				// errorString, then a generic message. The last byte of
				// the buffer is pre-set to '\0' and the strncpy() calls
				// copy at most sizeof-1 bytes, so those copies leave the
				// terminator in place.
				char holdReason[1024];
				holdReason[0] = '\0';
				holdReason[sizeof(holdReason)-1] = '\0';
				jobAd->LookupString( ATTR_HOLD_REASON, holdReason,
									 sizeof(holdReason) );
				if ( holdReason[0] == '\0' && errorString != "" ) {
					strncpy( holdReason, errorString.c_str(),
							 sizeof(holdReason) - 1 );
				}
				if ( holdReason[0] == '\0' ) {
					strncpy( holdReason, "Unspecified gridmanager error",
							 sizeof(holdReason) - 1 );
				}

				JobHeld( holdReason );
			}
			gmState = GM_DELETE;
			} break;
		default:
			// NOTE(review): EXCEPT presumably aborts and does not return
			// -- confirm; otherwise GMStateNames[] below could be indexed
			// with an invalid state.
			EXCEPT( "(%d.%d) Unknown gmState %d!", procID.cluster,procID.proc,
					gmState );
		}

		// A state change triggers another pass through the switch right
		// away, and invalidates everything cached for the old state.
		if ( gmState != old_gm_state ) {
			reevaluate_state = true;
			dprintf(D_FULLDEBUG, "(%d.%d) gm state change: %s -> %s\n",
					procID.cluster, procID.proc, GMStateNames[old_gm_state],
					GMStateNames[gmState]);
			enteredCurrentGmState = time(NULL);

			// If we were calling a gahp call that used RSL, we're done
			// with it now, so free it.
			if ( RSL ) {
				delete RSL;
				RSL = NULL;
			}
			// Likewise for the stage-in/stage-out file lists.
			if ( stageList ) {
				delete stageList;
				stageList = NULL;
			}
			if ( stageLocalList ) {
				delete stageLocalList;
				stageLocalList = NULL;
			}
		}

	} while ( reevaluate_state );
}