void UnicoreJob::UpdateUnicoreState( const char *update_ad_string ) { ClassAd *update_ad; ClassAdXMLParser xml_parser; if ( update_ad_string == NULL ) { dprintf( D_ALWAYS, "(%d.%d) Received NULL unicore status ad string\n", procID.cluster, procID.proc ); return; } update_ad = xml_parser.ParseClassAd( update_ad_string ); UpdateUnicoreState( update_ad ); delete update_ad; }
void UnicoreJob::doEvaluateState() { int old_gm_state; int old_unicore_state; bool reevaluate_state = true; time_t now = time(NULL); bool attr_exists; bool attr_dirty; int rc; daemonCore->Reset_Timer( evaluateStateTid, TIMER_NEVER ); dprintf(D_ALWAYS, "(%d.%d) doEvaluateState called: gmState %s, unicoreState %d\n", procID.cluster,procID.proc,GMStateNames[gmState],unicoreState); if ( gahp ) { gahp->setMode( GahpClient::normal ); } do { reevaluate_state = false; old_gm_state = gmState; old_unicore_state = unicoreState; ASSERT ( gahp != NULL || gmState == GM_HOLD || gmState == GM_DELETE ); switch ( gmState ) { case GM_INIT: { // This is the state all jobs start in when the GlobusJob object // is first created. Here, we do things that we didn't want to // do in the constructor because they could block (the // constructor is called while we're connected to the schedd). if ( gahp->Startup() == false ) { dprintf( D_ALWAYS, "(%d.%d) Error starting up GAHP\n", procID.cluster, procID.proc ); jobAd->Assign( ATTR_HOLD_REASON, "Failed to start GAHP" ); gmState = GM_HOLD; break; } GahpClient::mode saved_mode = gahp->getMode(); gahp->setMode( GahpClient::blocking ); rc = gahp->unicore_job_callback( UnicoreGahpCallbackHandler ); if ( rc != GLOBUS_SUCCESS ) { dprintf( D_ALWAYS, "(%d.%d) Error enabling unicore callback, err=%d\n", procID.cluster, procID.proc, rc ); jobAd->Assign( ATTR_HOLD_REASON, "Failed to initialize GAHP" ); gmState = GM_HOLD; break; } gahp->setMode( saved_mode ); gmState = GM_START; } break; case GM_START: { // This state is the real start of the state machine, after // one-time initialization has been taken care of. // If we think there's a running jobmanager // out there, we try to register for callbacks (in GM_REGISTER). // The one way jobs can end up back in this state is if we // attempt a restart of a jobmanager only to be told that the // old jobmanager process is still alive. errorString = ""; if ( jobContact == NULL ) { gmState = GM_CLEAR_REQUEST; } else if ( wantResubmit || doResubmit ) { gmState = GM_CLEAR_REQUEST; } else { if ( condorState == RUNNING ) { executeLogged = true; } gmState = GM_RECOVER; } } break; case GM_RECOVER: { // We're recovering from a crash after the job was submitted. // Allow the gahp server to recover its internal state about // the job. if ( submitAd == NULL ) { submitAd = buildSubmitAd(); } if ( submitAd == NULL ) { gmState = GM_HOLD; break; } rc = gahp->unicore_job_recover( submitAd->c_str() ); if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED || rc == GAHPCLIENT_COMMAND_PENDING ) { break; } if ( rc != GLOBUS_SUCCESS ) { // unhandled error dprintf(D_ALWAYS,"(%d.%d) unicore_job_recover() failed\n", procID.cluster, procID.proc); gmState = GM_CANCEL; break; } gmState = GM_SUBMITTED; } break; case GM_UNSUBMITTED: { // There are no outstanding gram submissions for this job (if // there is one, we've given up on it). if ( condorState == REMOVED ) { gmState = GM_DELETE; } else if ( condorState == HELD ) { gmState = GM_DELETE; break; } else { gmState = GM_SUBMIT; } } break; case GM_SUBMIT: { // Start a new gram submission for this job. char *job_contact = NULL; if ( condorState == REMOVED || condorState == HELD ) { gmState = GM_UNSUBMITTED; break; } if ( numSubmitAttempts >= MAX_SUBMIT_ATTEMPTS ) { jobAd->Assign( ATTR_HOLD_REASON, "Attempts to submit failed" ); gmState = GM_HOLD; break; } // After a submit, wait at least submitInterval before trying // another one. if ( now >= lastSubmitAttempt + submitInterval ) { if ( submitAd == NULL ) { submitAd = buildSubmitAd(); } if ( submitAd == NULL ) { gmState = GM_HOLD; break; } rc = gahp->unicore_job_create( submitAd->c_str(), &job_contact ); if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED || rc == GAHPCLIENT_COMMAND_PENDING ) { break; } lastSubmitAttempt = time(NULL); numSubmitAttempts++; if ( rc == GLOBUS_SUCCESS ) { // job_contact was strdup()ed for us. Now we take // responsibility for free()ing it. SetRemoteJobId( job_contact ); free( job_contact ); WriteGridSubmitEventToUserLog( jobAd ); gmState = GM_SUBMIT_SAVE; } else { // unhandled error dprintf(D_ALWAYS,"(%d.%d) unicore_job_create() failed\n", procID.cluster, procID.proc); dprintf(D_ALWAYS,"(%d.%d) submitAd='%s'\n", procID.cluster, procID.proc,submitAd->c_str()); if ( job_contact ) { free( job_contact ); } gmState = GM_UNSUBMITTED; reevaluate_state = true; } } else if ( condorState == REMOVED || condorState == HELD ) { gmState = GM_UNSUBMITTED; } else { unsigned int delay = 0; if ( (lastSubmitAttempt + submitInterval) > now ) { delay = (lastSubmitAttempt + submitInterval) - now; } daemonCore->Reset_Timer( evaluateStateTid, delay ); } } break; case GM_SUBMIT_SAVE: { // Save the jobmanager's contact for a new gram submission. if ( condorState == REMOVED || condorState == HELD ) { gmState = GM_CANCEL; } else { jobAd->GetDirtyFlag( ATTR_GRID_JOB_ID, &attr_exists, &attr_dirty ); if ( attr_exists && attr_dirty ) { requestScheddUpdate( this, true ); break; } gmState = GM_SUBMIT_COMMIT; } } break; case GM_SUBMIT_COMMIT: { // Now that we've saved the jobmanager's contact, commit the // gram job submission. if ( condorState == REMOVED || condorState == HELD ) { gmState = GM_CANCEL; } else { rc = gahp->unicore_job_start( jobContact ); if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED || rc == GAHPCLIENT_COMMAND_PENDING ) { break; } if ( rc != GLOBUS_SUCCESS ) { // unhandled error dprintf(D_ALWAYS,"(%d.%d) unicore_job_start() failed\n", procID.cluster, procID.proc); gmState = GM_CANCEL; } else { gmState = GM_SUBMITTED; } } } break; case GM_SUBMITTED: { // The job has been submitted (or is about to be by the // jobmanager). Wait for completion or failure, and probe the // jobmanager occassionally to make it's still alive. if ( unicoreState == COMPLETED ) { gmState = GM_DONE_SAVE; // } else if ( unicoreState == GLOBUS_GRAM_PROTOCOL_JOB_STATE_FAILED ) { // gmState = GM_CANCEL; } else if ( condorState == REMOVED || condorState == HELD ) { gmState = GM_CANCEL; } else if ( newRemoteStatusAd ) { dprintf(D_FULLDEBUG,"(%d.%d) *** Processing callback ad\n",procID.cluster, procID.proc ); lastProbeTime = now; UpdateUnicoreState( newRemoteStatusAd ); delete newRemoteStatusAd; newRemoteStatusAd = NULL; reevaluate_state = true; /* Now that the gahp tells us when a job status changes, we don't need to * do active probes. } else { if ( lastProbeTime < enteredCurrentGmState ) { lastProbeTime = enteredCurrentGmState; } if ( probeNow ) { lastProbeTime = 0; probeNow = false; } if ( now >= lastProbeTime + probeInterval ) { gmState = GM_PROBE_JOBMANAGER; break; } unsigned int delay = 0; if ( (lastProbeTime + probeInterval) > now ) { delay = (lastProbeTime + probeInterval) - now; } daemonCore->Reset_Timer( evaluateStateTid, delay ); */ } } break; case GM_PROBE_JOBMANAGER: { if ( condorState == REMOVED || condorState == HELD ) { gmState = GM_CANCEL; } else { char *status_ad = NULL; rc = gahp->unicore_job_status( jobContact, &status_ad ); if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED || rc == GAHPCLIENT_COMMAND_PENDING ) { break; } if ( rc != GLOBUS_SUCCESS ) { // unhandled error dprintf(D_ALWAYS,"(%d.%d) unicore_job_status() failed\n", procID.cluster, procID.proc); if ( status_ad ) { free( status_ad ); } gmState = GM_CANCEL; break; } UpdateUnicoreState( status_ad ); if ( status_ad ) { free( status_ad ); } if ( newRemoteStatusAd ) { delete newRemoteStatusAd; newRemoteStatusAd = NULL; } lastProbeTime = now; gmState = GM_SUBMITTED; } } break; case GM_DONE_SAVE: { // Report job completion to the schedd. JobTerminated(); if ( condorState == COMPLETED ) { jobAd->GetDirtyFlag( ATTR_JOB_STATUS, &attr_exists, &attr_dirty ); if ( attr_exists && attr_dirty ) { requestScheddUpdate( this, true ); break; } } gmState = GM_DONE_COMMIT; } break; case GM_DONE_COMMIT: { // Tell the jobmanager it can clean up and exit. rc = gahp->unicore_job_destroy( jobContact ); if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED || rc == GAHPCLIENT_COMMAND_PENDING ) { break; } if ( rc != GLOBUS_SUCCESS ) { // unhandled error dprintf(D_ALWAYS,"(%d.%d) unicore_job_destroy() failed\n", procID.cluster, procID.proc); gmState = GM_CANCEL; break; } if ( condorState == COMPLETED || condorState == REMOVED ) { gmState = GM_DELETE; } else { // Clear the contact string here because it may not get // cleared in GM_CLEAR_REQUEST (it might go to GM_HOLD first). // SetRemoteJobId( NULL ); // gmState = GM_CLEAR_REQUEST; gmState = GM_HOLD; } } break; case GM_CANCEL: { // We need to cancel the job submission. // if ( unicoreState != COMPLETED && // unicoreState != GLOBUS_GRAM_PROTOCOL_JOB_STATE_FAILED ) { if ( unicoreState != COMPLETED ) { rc = gahp->unicore_job_destroy( jobContact ); if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED || rc == GAHPCLIENT_COMMAND_PENDING ) { break; } if ( rc != GLOBUS_SUCCESS ) { // unhandled error dprintf(D_ALWAYS,"(%d.%d) unicore_job_destroy() failed\n", procID.cluster, procID.proc); gmState = GM_HOLD; break; } } if ( condorState == REMOVED ) { gmState = GM_DELETE; } else { gmState = GM_CLEAR_REQUEST; } } break; case GM_DELETE: { // We are done with the job. Propagate any remaining updates // to the schedd, then delete this object. DoneWithJob(); // This object will be deleted when the update occurs } break; case GM_CLEAR_REQUEST: { // Remove all knowledge of any previous or present job // submission, in both the gridmanager and the schedd. errorString = ""; SetRemoteJobId( NULL ); SetRemoteJobStatus( NULL ); JobIdle(); // If there are no updates to be done when we first enter this // state, requestScheddUpdate will return done immediately // and not waste time with a needless connection to the // schedd. If updates need to be made, they won't show up in // schedd_actions after the first pass through this state // because we modified our local variables the first time // through. However, since we registered update events the // first time, requestScheddUpdate won't return done until // they've been committed to the schedd. const char *name; ExprTree *expr; jobAd->ResetExpr(); if ( jobAd->NextDirtyExpr(name, expr) ) { requestScheddUpdate( this, true ); break; } executeLogged = false; terminateLogged = false; abortLogged = false; evictLogged = false; gmState = GM_UNSUBMITTED; } break; case GM_HOLD: { // Put the job on hold in the schedd. // TODO: what happens if we learn here that the job is removed? // if ( jobContact && // unicoreState != GLOBUS_GRAM_PROTOCOL_JOB_STATE_UNKNOWN ) { // unicoreState = GLOBUS_GRAM_PROTOCOL_JOB_STATE_UNKNOWN; // } // If the condor state is already HELD, then someone already // HELD it, so don't update anything else. if ( condorState != HELD ) { // Set the hold reason as best we can // TODO: set the hold reason in a more robust way. char holdReason[1024]; holdReason[0] = '\0'; holdReason[sizeof(holdReason)-1] = '\0'; jobAd->LookupString( ATTR_HOLD_REASON, holdReason, sizeof(holdReason) ); if ( holdReason[0] == '\0' && errorString != "" ) { strncpy( holdReason, errorString.c_str(), sizeof(holdReason) - 1 ); } if ( holdReason[0] == '\0' ) { strncpy( holdReason, "Unspecified gridmanager error", sizeof(holdReason) - 1 ); } JobHeld( holdReason ); } gmState = GM_DELETE; } break; default: EXCEPT( "(%d.%d) Unknown gmState %d!", procID.cluster,procID.proc, gmState ); } if ( gmState != old_gm_state || unicoreState != old_unicore_state ) { reevaluate_state = true; } if ( unicoreState != old_unicore_state ) { // dprintf(D_FULLDEBUG, "(%d.%d) globus state change: %s -> %s\n", // procID.cluster, procID.proc, // GlobusJobStatusName(old_globus_state), // GlobusJobStatusName(globusState)); enteredCurrentUnicoreState = time(NULL); } if ( gmState != old_gm_state ) { dprintf(D_FULLDEBUG, "(%d.%d) gm state change: %s -> %s\n", procID.cluster, procID.proc, GMStateNames[old_gm_state], GMStateNames[gmState]); enteredCurrentGmState = time(NULL); // If we were waiting for a pending unicore call, we're not // anymore so purge it. if ( gahp ) { gahp->purgePendingRequests(); } // If we were calling unicore_job_create and using submitAd, // we're done with it now, so free it. if ( submitAd ) { delete submitAd; submitAd = NULL; } } } while ( reevaluate_state ); }