Exemplo n.º 1
0
/**
 * Wrap up a job that has finished: record the outcome in the job ad,
 * push the final update to the job queue, write the user log event,
 * email the user, and then either recycle this shadow for another job
 * or exit.
 *
 * Two styles are handled:
 *  - US_NORMAL: the regular path.  The job ad is first marked with
 *    ATTR_TERMINATION_PENDING, the shadow is cleaned up, committed
 *    time accounting is updated, and the queue is updated before the
 *    event is logged and mailed.  If the queue update fails,
 *    retryJobCleanup() is called and this function returns early.
 *  - US_TERMINATE_PENDING: a shortcut taken when the job had already
 *    completed once but was requeued.  Exit information is read back
 *    from the job ad, the event is logged and mailed, and the shadow
 *    exits via DC_Exit().
 *
 * If there is no job ad at all, this aborts via EXCEPT() instead of
 * dereferencing a NULL pointer further down.
 *
 * @param kind which of the two update styles above to perform
 */
void
BaseShadow::terminateJob( update_style_t kind ) // has a default argument of US_NORMAL
{
	int reason;
	bool signaled;
	MyString str;

	if( ! jobAd ) {
			// Every path below dereferences jobAd, so there is nothing
			// useful we can do without it.  Fail loudly here rather
			// than crash on the first jobAd-> access.
		EXCEPT( "In terminateJob() w/ NULL JobAd!" );
	}

	/* The first thing we do is record that we are in a termination pending
		state. */
	if (kind == US_NORMAL) {
		str.formatstr("%s = TRUE", ATTR_TERMINATION_PENDING);
		jobAd->Insert(str.Value());
	}

	if (kind == US_TERMINATE_PENDING) {
		// In this case, the job had already completed once and the
		// status had been saved to the job queue, however, for
		// some reason, the shadow didn't exit with a good value and
		// the job had been requeued. When this style of update
		// is used, it is a shortcut from the very birth of the shadow
		// to here, and so there will not be a remote resource or
		// anything like that set up. In this situation, we just
		// want to write the log event and mail the user and exit
		// with a good exit code so the schedd removes the job from
		// the queue. If for some reason the logging fails once again,
		// the process continues to repeat. 
		// This means at least once semantics for the termination event
		// and user email, but at no time should the job actually execute
		// again.

		int exited_by_signal = FALSE;
		int exit_signal = 0;
		int exit_code = 0;

			// Recover how the job ended from the values saved in the ad.
		getJobAdExitedBySignal(jobAd, exited_by_signal);
		if (exited_by_signal == TRUE) {
			getJobAdExitSignal(jobAd, exit_signal);
		} else {
			getJobAdExitCode(jobAd, exit_code);
		}

		if (exited_by_signal == TRUE) {
			reason = JOB_COREDUMPED;
				// NOTE(review): core_file_name may not have been set on
				// this shortcut path (no remote resource exists yet);
				// confirm it is non-NULL here, otherwise this overwrites
				// the core file name previously saved in the job ad.
			str.formatstr("%s = \"%s\"", ATTR_JOB_CORE_FILENAME, core_file_name);
			jobAd->Insert(str.Value());
		} else {
			reason = JOB_EXITED;
		}

		dprintf( D_ALWAYS, "Job %d.%d terminated: %s %d\n",
	 		getCluster(), getProc(), 
	 		exited_by_signal? "killed by signal" : "exited with status",
	 		exited_by_signal ? exit_signal : exit_code );
		
			// write stuff to user log, but get values from jobad
		logTerminateEvent( reason, kind );

			// email the user, but get values from jobad
		emailTerminateEvent( reason, kind );

			// does not return.
		DC_Exit( reason );
	}

	// the default path when kind == US_NORMAL

	// cleanup this shadow (kill starters, etc)
	cleanUp();

	reason = getExitReason();
	signaled = exitedBySignal();

	/* also store the corefilename into the jobad so we can recover this 
		during a termination pending scenario. */
	if( reason == JOB_COREDUMPED ) {
		str.formatstr("%s = \"%s\"", ATTR_JOB_CORE_FILENAME, getCoreName());
		jobAd->Insert(str.Value());
	}

    // Update final Job committed time.  The interval being committed
    // starts at the later of the last checkpoint time and the current
    // start date, and ends now.
    int last_ckpt_time = 0;
    jobAd->LookupInteger(ATTR_LAST_CKPT_TIME, last_ckpt_time);
    int current_start_time = 0;
    jobAd->LookupInteger(ATTR_JOB_CURRENT_START_DATE, current_start_time);
    int int_value = (last_ckpt_time > current_start_time) ?
                        last_ckpt_time : current_start_time;

    if( int_value > 0 ) {
        int job_committed_time = 0;
        jobAd->LookupInteger(ATTR_JOB_COMMITTED_TIME, job_committed_time);
		int delta = static_cast<int>( time(NULL) ) - int_value;
        job_committed_time += delta;
        jobAd->Assign(ATTR_JOB_COMMITTED_TIME, job_committed_time);

			// Same interval, scaled by the slot weight (defaults to 1
			// when the attribute is not present in the ad).
		float slot_weight = 1;
		jobAd->LookupFloat(ATTR_JOB_MACHINE_ATTR_SLOT_WEIGHT0, slot_weight);
		float slot_time = 0;
		jobAd->LookupFloat(ATTR_COMMITTED_SLOT_TIME, slot_time);
		slot_time += slot_weight * delta;
		jobAd->Assign(ATTR_COMMITTED_SLOT_TIME, slot_time);
    }

	CommitSuspensionTime(jobAd);

	// update the job ad in the queue with some important final
	// attributes so we know what happened to the job when using
	// condor_history...
    if (m_num_cleanup_retries < 1 &&
        param_boolean("SHADOW_TEST_JOB_CLEANUP_RETRY", false)) {
			// Test hook: pretend the first final update failed so the
			// retry path gets exercised.
		dprintf( D_ALWAYS,
				 "Testing Failure to perform final update to job queue!\n");
		retryJobCleanup();
		return;
    }
	if( !updateJobInQueue(U_TERMINATE) ) {
		dprintf( D_ALWAYS, 
				 "Failed to perform final update to job queue!\n");
		retryJobCleanup();
		return;
	}

	// Let's maximize the effectiveness of that queue synchronization and
	// only record the job as done if the update to the queue was successful.
	// If any of these next operations fail and the shadow exits with an
	// exit code which causes the job to get requeued, it will be in the
	// "terminate pending" state marked by the ATTR_TERMINATION_PENDING
	// attribute.

	dprintf( D_ALWAYS, "Job %d.%d terminated: %s %d\n",
	 	getCluster(), getProc(), 
	 	signaled ? "killed by signal" : "exited with status",
	 	signaled ? exitSignal() : exitCode() );

	// write stuff to user log:
	logTerminateEvent( reason );

	// email the user
	emailTerminateEvent( reason );

	if( reason == JOB_EXITED && claimIsClosing() ) {
			// Startd not accepting any more jobs on this claim.
			// We do this here to avoid having to treat this case
			// identically to JOB_EXITED in the code leading up to
			// this point.
		dprintf(D_FULLDEBUG,"Startd is closing claim, so no more jobs can be run on it.\n");
		reason = JOB_EXITED_AND_CLAIM_CLOSING;
	}

	// try to get a new job for this shadow
	if( recycleShadow(reason) ) {
		// recycleShadow deletes this, so we must return immediately
		return;
	}

	// does not return.
	DC_Exit( reason );
}
Exemplo n.º 2
0
/**
 * A job is exiting the Starter and we need to take necessary
 * actions. First we will update the job's ad file with various
 * information about what the job did. Next, if the job completed on
 * its own, we'll want to call the StarterUserPolicy's checkAtExit(),
 * which handles setting the right exit status to control the job's
 * final state in the job queue. If the job is being killed from "unnatural"
 * causes, such as a condor_rm, then we will figure out the right
 * update type is for the job and write an EVICT event to the user log.
 * 
 * @param exit_status - the exit status of the job from wait()
 * This is not used currently
 * @param reason - the Condor-defined reason why the job is exiting
 * @param user_proc - the Proc object for this job
 * @return true if the job was set to exit properly
 * @see h/exit.h
 **/
bool
JICLocalSchedd::notifyJobExit( int, int reason, 
							   UserProc* user_proc )
{
		// Remember what steps we've completed, in case we need to retry.
	static bool did_final_ad_publish = false;
	static bool did_schedd_update = false;
	static bool did_check_at_exit = false;
	static bool did_ulog_event = false;

	m_tried_notify_job_exit = true;
 
	if (!did_final_ad_publish) {
			// Prepare to update the job queue.  In this case, we want
			// to publish all the same attribute we'd otherwise send
			// to the shadow, but instead, just stick them directly
			// into our copy of the job classad.
		Starter->publishPreScriptUpdateAd( job_ad );
		if( user_proc ) {
			user_proc->PublishUpdateAd( job_ad );
		}
		Starter->publishPostScriptUpdateAd( job_ad );
		did_final_ad_publish = true;
	}
	
		// Only check to see what we should do with our job 
		// in the user policy object if the job terminated
		// on its own.  Otherwise, we've already been there
		// and done that.
	if ( reason == JOB_EXITED || reason == JOB_COREDUMPED ) {
		if( !did_check_at_exit ) {
				// What should be the return value for this?
				// Can I just assume that things went well?
			this->starter_user_policy->checkAtExit( );
			did_check_at_exit = true;
		}
	}
	else if( reason == JOB_MISSED_DEFERRAL_TIME ) {
			//
			// This is suppose to be temporary until we have some kind
			// of error handling in place for jobs that never started
			// Andy Pavlo - 01.24.2006 - [email protected]
			//
		exit_code = JOB_MISSED_DEFERRAL_TIME;
	}

	if( !did_ulog_event ) {
			// Use the final exit code to determine what event to log.
			// This may be different from what is indicated by 'reason',
			// because a policy expression evaluted by checkAtExit() may
			// have changed things.
		switch( this->exit_code ) {
		case JOB_EXITED:
			this->u_log->logTerminate( this->job_ad );
			did_ulog_event = true;
			break;
		case JOB_SHOULD_REQUEUE:
			// Following the baseshadow, if the job is being requeued
			// then it is an eviction event
			this->u_log->logRequeueEvent( this->job_ad, false );
			did_ulog_event = true;
			break;
		case JOB_SHOULD_REMOVE:
		case JOB_SHOULD_HOLD:
		case JOB_MISSED_DEFERRAL_TIME:
			// NOTE: The local universe's log actions are not consistent
			// with what the Shadow does. This is because the Shadow is
			// not consistent with itself; for example, a condor_rm
			// will cause an EVICT notice in the user log, but a 
			// periodic remove will not. This is something Derek
			// said he will clean up later on. For now, however, we are
			// going to be consistent with ourself in the local universe
			// and ALWAYS send an eviction notice when the job is 
			// removed
			this->u_log->logEvict( this->job_ad, false );
			did_ulog_event = true;
			break;
		default:
			EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code);
		}
	}


	if( !did_schedd_update ) {
			// Use the final exit code to determine the update type.
			// This may be different from what is indicated by 'reason',
			// because a policy expression evaluted by checkAtExit() may
			// have changed things.
		update_t up_type = U_TERMINATE;
		switch( this->exit_code ) {
		case JOB_EXITED:
			up_type = U_TERMINATE;
			break;
		case JOB_SHOULD_REQUEUE:
			up_type = U_REQUEUE;
			break;
		case JOB_SHOULD_REMOVE:
			up_type = U_REMOVE;
			break;
		case JOB_SHOULD_HOLD:
		case JOB_MISSED_DEFERRAL_TIME:
			up_type = U_HOLD;
			break;
		default:
			EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code);
		}

			// Now that we've logged the event, we can update the job queue
			// If we're doing a fast shutdown, don't retry on failure.
		if ( !this->job_updater->updateJob( up_type ) && !fast_exit ) {
			dprintf( D_ALWAYS,
			         "Failed to update job queue - attempting to retry.\n" );
			retryJobCleanup();
			return ( false );
		}

		did_schedd_update = true;
	}

		//
		// Once the job's been updated in the queue, we can also try
		// sending email notification, if desired.
		// This returns void, so there's no way to test for failure.
		// Therefore, we don't bother with retry.
		//
	Email msg;
	switch( this->exit_code ) {
	case JOB_SHOULD_REQUEUE:
	case JOB_EXITED:
		msg.sendExit( job_ad, reason );
		break;
	case JOB_SHOULD_REMOVE: {
		char *remove_reason = NULL;
		this->job_ad->LookupString( ATTR_REMOVE_REASON, &remove_reason );
		msg.sendRemove( this->job_ad, remove_reason ? remove_reason : "" );
		free( remove_reason );
		break;
	}
	case JOB_SHOULD_HOLD: {
		char *hold_reason = NULL;
		this->job_ad->LookupString( ATTR_HOLD_REASON, &hold_reason );
		msg.sendHold( this->job_ad, hold_reason ? hold_reason : "" );
		free( hold_reason );
		break;
	}
	case JOB_MISSED_DEFERRAL_TIME:
		msg.sendHold( this->job_ad, "missed derreral time" );
		break;
	default:
		EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code);
	}

		//
		// Lastly, we will call to write out the file. This was 
		// originally done in JICLocal::notifyJobExit(), but we no
		// longer call that
		//
	this->writeOutputAdFile( this->job_ad );

		//
		// Once we get here, everything has been successfully
		// wrapped up.
		//
	return true;
}