Beispiel #1
0
void
BaseShadow::emailHoldEvent( const char* reason ) 
{
	Email mailer;
	mailer.sendHold( jobAd, reason );
}
Beispiel #2
0
/**
 * A job is exiting the Starter and we need to take necessary
 * actions. First we will update the job's ad file with various
 * information about what the job did. Next, if the job completed on
 * its own, we'll want to call the StarterUserPolicy's checkAtExit(),
 * which handles setting the right exit status to control the job's
 * final state in the job queue. If the job is being killed from "unnatural"
 * causes, such as a condor_rm, then we will figure out the right
 * update type is for the job and write an EVICT event to the user log.
 * 
 * @param exit_status - the exit status of the job from wait()
 * This is not used currently
 * @param reason - the Condor-defined reason why the job is exiting
 * @param user_proc - the Proc object for this job
 * @return true if the job was set to exit properly
 * @see h/exit.h
 **/
bool
JICLocalSchedd::notifyJobExit( int, int reason, 
							   UserProc* user_proc )
{
		// Remember what steps we've completed, in case we need to retry.
	static bool did_final_ad_publish = false;
	static bool did_schedd_update = false;
	static bool did_check_at_exit = false;
	static bool did_ulog_event = false;

	m_tried_notify_job_exit = true;
 
	if (!did_final_ad_publish) {
			// Prepare to update the job queue.  In this case, we want
			// to publish all the same attribute we'd otherwise send
			// to the shadow, but instead, just stick them directly
			// into our copy of the job classad.
		Starter->publishPreScriptUpdateAd( job_ad );
		if( user_proc ) {
			user_proc->PublishUpdateAd( job_ad );
		}
		Starter->publishPostScriptUpdateAd( job_ad );
		did_final_ad_publish = true;
	}
	
		// Only check to see what we should do with our job 
		// in the user policy object if the job terminated
		// on its own.  Otherwise, we've already been there
		// and done that.
	if ( reason == JOB_EXITED || reason == JOB_COREDUMPED ) {
		if( !did_check_at_exit ) {
				// What should be the return value for this?
				// Can I just assume that things went well?
			this->starter_user_policy->checkAtExit( );
			did_check_at_exit = true;
		}
	}
	else if( reason == JOB_MISSED_DEFERRAL_TIME ) {
			//
			// This is suppose to be temporary until we have some kind
			// of error handling in place for jobs that never started
			// Andy Pavlo - 01.24.2006 - [email protected]
			//
		exit_code = JOB_MISSED_DEFERRAL_TIME;
	}

	if( !did_ulog_event ) {
			// Use the final exit code to determine what event to log.
			// This may be different from what is indicated by 'reason',
			// because a policy expression evaluted by checkAtExit() may
			// have changed things.
		switch( this->exit_code ) {
		case JOB_EXITED:
			this->u_log->logTerminate( this->job_ad );
			did_ulog_event = true;
			break;
		case JOB_SHOULD_REQUEUE:
			// Following the baseshadow, if the job is being requeued
			// then it is an eviction event
			this->u_log->logRequeueEvent( this->job_ad, false );
			did_ulog_event = true;
			break;
		case JOB_SHOULD_REMOVE:
		case JOB_SHOULD_HOLD:
		case JOB_MISSED_DEFERRAL_TIME:
			// NOTE: The local universe's log actions are not consistent
			// with what the Shadow does. This is because the Shadow is
			// not consistent with itself; for example, a condor_rm
			// will cause an EVICT notice in the user log, but a 
			// periodic remove will not. This is something Derek
			// said he will clean up later on. For now, however, we are
			// going to be consistent with ourself in the local universe
			// and ALWAYS send an eviction notice when the job is 
			// removed
			this->u_log->logEvict( this->job_ad, false );
			did_ulog_event = true;
			break;
		default:
			EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code);
		}
	}


	if( !did_schedd_update ) {
			// Use the final exit code to determine the update type.
			// This may be different from what is indicated by 'reason',
			// because a policy expression evaluted by checkAtExit() may
			// have changed things.
		update_t up_type = U_TERMINATE;
		switch( this->exit_code ) {
		case JOB_EXITED:
			up_type = U_TERMINATE;
			break;
		case JOB_SHOULD_REQUEUE:
			up_type = U_REQUEUE;
			break;
		case JOB_SHOULD_REMOVE:
			up_type = U_REMOVE;
			break;
		case JOB_SHOULD_HOLD:
		case JOB_MISSED_DEFERRAL_TIME:
			up_type = U_HOLD;
			break;
		default:
			EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code);
		}

			// Now that we've logged the event, we can update the job queue
			// If we're doing a fast shutdown, don't retry on failure.
		if ( !this->job_updater->updateJob( up_type ) && !fast_exit ) {
			dprintf( D_ALWAYS,
			         "Failed to update job queue - attempting to retry.\n" );
			retryJobCleanup();
			return ( false );
		}

		did_schedd_update = true;
	}

		//
		// Once the job's been updated in the queue, we can also try
		// sending email notification, if desired.
		// This returns void, so there's no way to test for failure.
		// Therefore, we don't bother with retry.
		//
	Email msg;
	switch( this->exit_code ) {
	case JOB_SHOULD_REQUEUE:
	case JOB_EXITED:
		msg.sendExit( job_ad, reason );
		break;
	case JOB_SHOULD_REMOVE: {
		char *remove_reason = NULL;
		this->job_ad->LookupString( ATTR_REMOVE_REASON, &remove_reason );
		msg.sendRemove( this->job_ad, remove_reason ? remove_reason : "" );
		free( remove_reason );
		break;
	}
	case JOB_SHOULD_HOLD: {
		char *hold_reason = NULL;
		this->job_ad->LookupString( ATTR_HOLD_REASON, &hold_reason );
		msg.sendHold( this->job_ad, hold_reason ? hold_reason : "" );
		free( hold_reason );
		break;
	}
	case JOB_MISSED_DEFERRAL_TIME:
		msg.sendHold( this->job_ad, "missed derreral time" );
		break;
	default:
		EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code);
	}

		//
		// Lastly, we will call to write out the file. This was 
		// originally done in JICLocal::notifyJobExit(), but we no
		// longer call that
		//
	this->writeOutputAdFile( this->job_ad );

		//
		// Once we get here, everything has been successfully
		// wrapped up.
		//
	return true;
}