/**
 * Wrap up a job that has stopped running.
 *
 * Two paths exist:
 *
 *  - kind == US_TERMINATE_PENDING: the job's final status was already
 *    saved to the job queue by an earlier shadow.  Only the exit
 *    information stored in the job ad is consulted; the terminate
 *    event is logged, the user is emailed, and the shadow exits via
 *    DC_Exit().
 *
 *  - otherwise (normally US_NORMAL): the shadow is cleaned up, final
 *    attributes (core file name, committed time, committed slot time,
 *    suspension time) are stored in the job ad, the job queue is
 *    updated, the terminate event is logged, the user is emailed, and
 *    the shadow is either recycled for another job or exits via
 *    DC_Exit().  If the final queue update fails, retryJobCleanup()
 *    is called and the function returns without logging or emailing.
 *
 * @param kind  which of the two update styles described above to use
 */
void BaseShadow::terminateJob( update_style_t kind ) // has a default argument of US_NORMAL
{
    int reason;
    bool signaled;
    MyString str;

    if( ! jobAd ) {
        // Every path below dereferences jobAd, so continuing after a
        // mere log message would be a NULL dereference.  Treat this as
        // a fatal internal error instead.
        EXCEPT( "In terminateJob() w/ NULL JobAd!" );
    }

    /* The first thing we do is record that we are in a termination
       pending state. */
    if (kind == US_NORMAL) {
        str.formatstr("%s = TRUE", ATTR_TERMINATION_PENDING);
        jobAd->Insert(str.Value());
    }

    if (kind == US_TERMINATE_PENDING) {
        // In this case, the job had already completed once and the
        // status had been saved to the job queue, however, for
        // some reason, the shadow didn't exit with a good value and
        // the job had been requeued. When this style of update
        // is used, it is a shortcut from the very birth of the shadow
        // to here, and so there will not be a remote resource or
        // anything like that set up. In this situation, we just
        // want to write the log event and mail the user and exit
        // with a good exit code so the schedd removes the job from
        // the queue. If for some reason the logging fails once again,
        // the process continues to repeat.
        // This means at least once semantics for the termination event
        // and user email, but at no time should the job actually execute
        // again.

        int exited_by_signal = FALSE;
        int exit_signal = 0;
        int exit_code = 0;

        // Pull the recorded exit information back out of the job ad.
        getJobAdExitedBySignal(jobAd, exited_by_signal);
        if (exited_by_signal == TRUE) {
            getJobAdExitSignal(jobAd, exit_signal);
        } else {
            getJobAdExitCode(jobAd, exit_code);
        }

        if (exited_by_signal == TRUE) {
            reason = JOB_COREDUMPED;
            // NOTE(review): core_file_name is not assigned anywhere in
            // this function; if it can still be NULL on this shortcut
            // path, passing it to "%s" is undefined -- confirm.
            // NOTE(review): the value is spliced into a hand-built
            // expression string, so a name containing '"' or '\\' would
            // not round-trip -- consider a typed insert.
            str.formatstr("%s = \"%s\"", ATTR_JOB_CORE_FILENAME, core_file_name);
            jobAd->Insert(str.Value());
        } else {
            reason = JOB_EXITED;
        }

        dprintf( D_ALWAYS, "Job %d.%d terminated: %s %d\n",
                 getCluster(), getProc(),
                 exited_by_signal? "killed by signal" : "exited with status",
                 exited_by_signal ? exit_signal : exit_code );

        // write stuff to user log, but get values from jobad
        logTerminateEvent( reason, kind );

        // email the user, but get values from jobad
        emailTerminateEvent( reason, kind );

        // Exit here; the default path below is not taken.
        DC_Exit( reason );
    }

    // the default path when kind == US_NORMAL

    // cleanup this shadow (kill starters, etc)
    cleanUp();

    reason = getExitReason();
    signaled = exitedBySignal();

    /* also store the corefilename into the jobad so we can recover this
       during a termination pending scenario. */
    if( reason == JOB_COREDUMPED ) {
        // NOTE(review): same hand-built quoting caveat as above.
        str.formatstr("%s = \"%s\"", ATTR_JOB_CORE_FILENAME, getCoreName());
        jobAd->Insert(str.Value());
    }

    // Update final Job committed time.  The interval being committed
    // starts at the later of the last checkpoint time and the current
    // start date, and ends now.
    int last_ckpt_time = 0;
    jobAd->LookupInteger(ATTR_LAST_CKPT_TIME, last_ckpt_time);
    int current_start_time = 0;
    jobAd->LookupInteger(ATTR_JOB_CURRENT_START_DATE, current_start_time);
    int int_value = (last_ckpt_time > current_start_time) ?
                        last_ckpt_time : current_start_time;

    if( int_value > 0 ) {
        int job_committed_time = 0;
        jobAd->LookupInteger(ATTR_JOB_COMMITTED_TIME, job_committed_time);
        int delta = static_cast<int>(time(NULL)) - int_value;
        job_committed_time += delta;
        jobAd->Assign(ATTR_JOB_COMMITTED_TIME, job_committed_time);

        // Slot time is the same interval scaled by the slot weight,
        // which stays at 1 when the lookup leaves it untouched.
        float slot_weight = 1;
        jobAd->LookupFloat(ATTR_JOB_MACHINE_ATTR_SLOT_WEIGHT0, slot_weight);
        float slot_time = 0;
        jobAd->LookupFloat(ATTR_COMMITTED_SLOT_TIME, slot_time);
        slot_time += slot_weight * delta;
        jobAd->Assign(ATTR_COMMITTED_SLOT_TIME, slot_time);
    }

    CommitSuspensionTime(jobAd);

    // update the job ad in the queue with some important final
    // attributes so we know what happened to the job when using
    // condor_history...
    if (m_num_cleanup_retries < 1 &&
        param_boolean("SHADOW_TEST_JOB_CLEANUP_RETRY", false)) {
        // Test hook: pretend the first final update failed so that the
        // retry path gets exercised.
        dprintf( D_ALWAYS,
                 "Testing Failure to perform final update to job queue!\n");
        retryJobCleanup();
        return;
    }
    if( !updateJobInQueue(U_TERMINATE) ) {
        dprintf( D_ALWAYS,
                 "Failed to perform final update to job queue!\n");
        retryJobCleanup();
        return;
    }

    // Let's maximize the effectiveness of that queue synchronization and
    // only record the job as done if the update to the queue was successful.
    // If any of these next operations fail and the shadow exits with an
    // exit code which causes the job to get requeued, it will be in the
    // "terminate pending" state marked by the ATTR_TERMINATION_PENDING
    // attribute.

    dprintf( D_ALWAYS, "Job %d.%d terminated: %s %d\n",
             getCluster(), getProc(),
             signaled ? "killed by signal" : "exited with status",
             signaled ? exitSignal() : exitCode() );

    // write stuff to user log:
    logTerminateEvent( reason );

    // email the user
    emailTerminateEvent( reason );

    if( reason == JOB_EXITED && claimIsClosing() ) {
        // Startd not accepting any more jobs on this claim.
        // We do this here to avoid having to treat this case
        // identically to JOB_EXITED in the code leading up to
        // this point.
        dprintf(D_FULLDEBUG,"Startd is closing claim, so no more jobs can be run on it.\n");
        reason = JOB_EXITED_AND_CLAIM_CLOSING;
    }

    // try to get a new job for this shadow
    if( recycleShadow(reason) ) {
        // recycleShadow delete's this, so we must return immediately
        return;
    }

    // does not return.
    DC_Exit( reason );
}
/** * A job is exiting the Starter and we need to take necessary * actions. First we will update the job's ad file with various * information about what the job did. Next, if the job completed on * its own, we'll want to call the StarterUserPolicy's checkAtExit(), * which handles setting the right exit status to control the job's * final state in the job queue. If the job is being killed from "unnatural" * causes, such as a condor_rm, then we will figure out the right * update type is for the job and write an EVICT event to the user log. * * @param exit_status - the exit status of the job from wait() * This is not used currently * @param reason - the Condor-defined reason why the job is exiting * @param user_proc - the Proc object for this job * @return true if the job was set to exit properly * @see h/exit.h **/ bool JICLocalSchedd::notifyJobExit( int, int reason, UserProc* user_proc ) { // Remember what steps we've completed, in case we need to retry. static bool did_final_ad_publish = false; static bool did_schedd_update = false; static bool did_check_at_exit = false; static bool did_ulog_event = false; m_tried_notify_job_exit = true; if (!did_final_ad_publish) { // Prepare to update the job queue. In this case, we want // to publish all the same attribute we'd otherwise send // to the shadow, but instead, just stick them directly // into our copy of the job classad. Starter->publishPreScriptUpdateAd( job_ad ); if( user_proc ) { user_proc->PublishUpdateAd( job_ad ); } Starter->publishPostScriptUpdateAd( job_ad ); did_final_ad_publish = true; } // Only check to see what we should do with our job // in the user policy object if the job terminated // on its own. Otherwise, we've already been there // and done that. if ( reason == JOB_EXITED || reason == JOB_COREDUMPED ) { if( !did_check_at_exit ) { // What should be the return value for this? // Can I just assume that things went well? 
this->starter_user_policy->checkAtExit( ); did_check_at_exit = true; } } else if( reason == JOB_MISSED_DEFERRAL_TIME ) { // // This is suppose to be temporary until we have some kind // of error handling in place for jobs that never started // Andy Pavlo - 01.24.2006 - [email protected] // exit_code = JOB_MISSED_DEFERRAL_TIME; } if( !did_ulog_event ) { // Use the final exit code to determine what event to log. // This may be different from what is indicated by 'reason', // because a policy expression evaluted by checkAtExit() may // have changed things. switch( this->exit_code ) { case JOB_EXITED: this->u_log->logTerminate( this->job_ad ); did_ulog_event = true; break; case JOB_SHOULD_REQUEUE: // Following the baseshadow, if the job is being requeued // then it is an eviction event this->u_log->logRequeueEvent( this->job_ad, false ); did_ulog_event = true; break; case JOB_SHOULD_REMOVE: case JOB_SHOULD_HOLD: case JOB_MISSED_DEFERRAL_TIME: // NOTE: The local universe's log actions are not consistent // with what the Shadow does. This is because the Shadow is // not consistent with itself; for example, a condor_rm // will cause an EVICT notice in the user log, but a // periodic remove will not. This is something Derek // said he will clean up later on. For now, however, we are // going to be consistent with ourself in the local universe // and ALWAYS send an eviction notice when the job is // removed this->u_log->logEvict( this->job_ad, false ); did_ulog_event = true; break; default: EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code); } } if( !did_schedd_update ) { // Use the final exit code to determine the update type. // This may be different from what is indicated by 'reason', // because a policy expression evaluted by checkAtExit() may // have changed things. 
update_t up_type = U_TERMINATE; switch( this->exit_code ) { case JOB_EXITED: up_type = U_TERMINATE; break; case JOB_SHOULD_REQUEUE: up_type = U_REQUEUE; break; case JOB_SHOULD_REMOVE: up_type = U_REMOVE; break; case JOB_SHOULD_HOLD: case JOB_MISSED_DEFERRAL_TIME: up_type = U_HOLD; break; default: EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code); } // Now that we've logged the event, we can update the job queue // If we're doing a fast shutdown, don't retry on failure. if ( !this->job_updater->updateJob( up_type ) && !fast_exit ) { dprintf( D_ALWAYS, "Failed to update job queue - attempting to retry.\n" ); retryJobCleanup(); return ( false ); } did_schedd_update = true; } // // Once the job's been updated in the queue, we can also try // sending email notification, if desired. // This returns void, so there's no way to test for failure. // Therefore, we don't bother with retry. // Email msg; switch( this->exit_code ) { case JOB_SHOULD_REQUEUE: case JOB_EXITED: msg.sendExit( job_ad, reason ); break; case JOB_SHOULD_REMOVE: { char *remove_reason = NULL; this->job_ad->LookupString( ATTR_REMOVE_REASON, &remove_reason ); msg.sendRemove( this->job_ad, remove_reason ? remove_reason : "" ); free( remove_reason ); break; } case JOB_SHOULD_HOLD: { char *hold_reason = NULL; this->job_ad->LookupString( ATTR_HOLD_REASON, &hold_reason ); msg.sendHold( this->job_ad, hold_reason ? hold_reason : "" ); free( hold_reason ); break; } case JOB_MISSED_DEFERRAL_TIME: msg.sendHold( this->job_ad, "missed derreral time" ); break; default: EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code); } // // Lastly, we will call to write out the file. This was // originally done in JICLocal::notifyJobExit(), but we no // longer call that // this->writeOutputAdFile( this->job_ad ); // // Once we get here, everything has been successfully // wrapped up. // return true; }