void BaseShadow::emailRemoveEvent( const char* reason ) { Email mailer; mailer.sendRemove( jobAd, reason ); }
/** * A job is exiting the Starter and we need to take necessary * actions. First we will update the job's ad file with various * information about what the job did. Next, if the job completed on * its own, we'll want to call the StarterUserPolicy's checkAtExit(), * which handles setting the right exit status to control the job's * final state in the job queue. If the job is being killed from "unnatural" * causes, such as a condor_rm, then we will figure out the right * update type is for the job and write an EVICT event to the user log. * * @param exit_status - the exit status of the job from wait() * This is not used currently * @param reason - the Condor-defined reason why the job is exiting * @param user_proc - the Proc object for this job * @return true if the job was set to exit properly * @see h/exit.h **/ bool JICLocalSchedd::notifyJobExit( int, int reason, UserProc* user_proc ) { // Remember what steps we've completed, in case we need to retry. static bool did_final_ad_publish = false; static bool did_schedd_update = false; static bool did_check_at_exit = false; static bool did_ulog_event = false; m_tried_notify_job_exit = true; if (!did_final_ad_publish) { // Prepare to update the job queue. In this case, we want // to publish all the same attribute we'd otherwise send // to the shadow, but instead, just stick them directly // into our copy of the job classad. Starter->publishPreScriptUpdateAd( job_ad ); if( user_proc ) { user_proc->PublishUpdateAd( job_ad ); } Starter->publishPostScriptUpdateAd( job_ad ); did_final_ad_publish = true; } // Only check to see what we should do with our job // in the user policy object if the job terminated // on its own. Otherwise, we've already been there // and done that. if ( reason == JOB_EXITED || reason == JOB_COREDUMPED ) { if( !did_check_at_exit ) { // What should be the return value for this? // Can I just assume that things went well? this->starter_user_policy->checkAtExit( ); did_check_at_exit = true; } } else if( reason == JOB_MISSED_DEFERRAL_TIME ) { // // This is suppose to be temporary until we have some kind // of error handling in place for jobs that never started // Andy Pavlo - 01.24.2006 - [email protected] // exit_code = JOB_MISSED_DEFERRAL_TIME; } if( !did_ulog_event ) { // Use the final exit code to determine what event to log. // This may be different from what is indicated by 'reason', // because a policy expression evaluted by checkAtExit() may // have changed things. switch( this->exit_code ) { case JOB_EXITED: this->u_log->logTerminate( this->job_ad ); did_ulog_event = true; break; case JOB_SHOULD_REQUEUE: // Following the baseshadow, if the job is being requeued // then it is an eviction event this->u_log->logRequeueEvent( this->job_ad, false ); did_ulog_event = true; break; case JOB_SHOULD_REMOVE: case JOB_SHOULD_HOLD: case JOB_MISSED_DEFERRAL_TIME: // NOTE: The local universe's log actions are not consistent // with what the Shadow does. This is because the Shadow is // not consistent with itself; for example, a condor_rm // will cause an EVICT notice in the user log, but a // periodic remove will not. This is something Derek // said he will clean up later on. For now, however, we are // going to be consistent with ourself in the local universe // and ALWAYS send an eviction notice when the job is // removed this->u_log->logEvict( this->job_ad, false ); did_ulog_event = true; break; default: EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code); } } if( !did_schedd_update ) { // Use the final exit code to determine the update type. // This may be different from what is indicated by 'reason', // because a policy expression evaluted by checkAtExit() may // have changed things. update_t up_type = U_TERMINATE; switch( this->exit_code ) { case JOB_EXITED: up_type = U_TERMINATE; break; case JOB_SHOULD_REQUEUE: up_type = U_REQUEUE; break; case JOB_SHOULD_REMOVE: up_type = U_REMOVE; break; case JOB_SHOULD_HOLD: case JOB_MISSED_DEFERRAL_TIME: up_type = U_HOLD; break; default: EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code); } // Now that we've logged the event, we can update the job queue // If we're doing a fast shutdown, don't retry on failure. if ( !this->job_updater->updateJob( up_type ) && !fast_exit ) { dprintf( D_ALWAYS, "Failed to update job queue - attempting to retry.\n" ); retryJobCleanup(); return ( false ); } did_schedd_update = true; } // // Once the job's been updated in the queue, we can also try // sending email notification, if desired. // This returns void, so there's no way to test for failure. // Therefore, we don't bother with retry. // Email msg; switch( this->exit_code ) { case JOB_SHOULD_REQUEUE: case JOB_EXITED: msg.sendExit( job_ad, reason ); break; case JOB_SHOULD_REMOVE: { char *remove_reason = NULL; this->job_ad->LookupString( ATTR_REMOVE_REASON, &remove_reason ); msg.sendRemove( this->job_ad, remove_reason ? remove_reason : "" ); free( remove_reason ); break; } case JOB_SHOULD_HOLD: { char *hold_reason = NULL; this->job_ad->LookupString( ATTR_HOLD_REASON, &hold_reason ); msg.sendHold( this->job_ad, hold_reason ? hold_reason : "" ); free( hold_reason ); break; } case JOB_MISSED_DEFERRAL_TIME: msg.sendHold( this->job_ad, "missed derreral time" ); break; default: EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code); } // // Lastly, we will call to write out the file. This was // originally done in JICLocal::notifyJobExit(), but we no // longer call that // this->writeOutputAdFile( this->job_ad ); // // Once we get here, everything has been successfully // wrapped up. // return true; }