void HoldJob( const char* long_reason, const char* short_reason, int reason_code, int reason_subcode ) { char subject[ BUFSIZ ]; FILE *mailer; sprintf( subject, "Condor Job %d.%d put on hold\n", Proc->id.cluster, Proc->id.proc ); if( ! JobAd ) { dprintf( D_ALWAYS, "In HoldJob() w/ NULL JobAd!\n" ); exit( JOB_SHOULD_HOLD ); } ExitReason = JOB_SHOULD_HOLD; if ( !ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT) ) { dprintf( D_ALWAYS, "Failed to connect to schedd!\n" ); } SetAttributeString( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON, short_reason ); SetAttributeInt( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON_CODE, reason_code ); SetAttributeInt( Proc->id.cluster, Proc->id.proc, ATTR_HOLD_REASON_SUBCODE, reason_subcode ); if ( !DisconnectQ(0) ) { dprintf( D_ALWAYS, "Failed to commit updated job queue status!\n" ); } mailer = email_user_open(JobAd, subject); if( ! mailer ) { // User didn't want email, so just exit now with the right // value so the schedd actually puts the job on hold. dprintf( D_ALWAYS, "Job going into Hold state.\n"); dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n", JOB_SHOULD_HOLD); exit( JOB_SHOULD_HOLD ); } fprintf( mailer, "Your condor job " ); if( Proc->args_v1or2[0] ) { ArgList args; MyString args_string; args.AppendArgsV1or2Raw(Proc->args_v1or2[0],NULL); args.GetArgsStringForDisplay(&args_string); fprintf( mailer, "%s %s ", Proc->cmd[0], args_string.Value() ); } else { fprintf( mailer, "%s ", Proc->cmd[0] ); } fprintf( mailer, "\nis being put on hold.\n\n" ); fprintf( mailer, "%s", long_reason ); email_close(mailer); // Now that the user knows why, exit with the right code. dprintf( D_ALWAYS, "Job going into Hold state.\n"); dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n", JOB_SHOULD_HOLD); exit( JOB_SHOULD_HOLD ); }
//--------------------------------------------------------------------------- void DagmanClassad::SetDagAttribute( const char *attrName, const MyString &value ) { if ( SetAttributeString( _dagmanId._cluster, _dagmanId._proc, attrName, value.Value() ) != 0 ) { debug_printf( DEBUG_QUIET, "WARNING: failed to set attribute %s\n", attrName ); check_warning_strictness( DAG_STRICT_3 ); } }
void update_job_status( struct rusage *localp, struct rusage *remotep ) { int status = -1; double utime = 0.0; double stime = 0.0; int tot_sus=0, cum_sus=0, last_sus=0; char buf[1024*50]; // If the job completed, and there is no HISTORY file specified, // the don't bother to update the job ClassAd since it is about to be // flushed into the bit bucket by the schedd anyway. char *myHistoryFile = param("HISTORY"); if ((Proc->status == COMPLETED) && (myHistoryFile==NULL)) { return; } if (myHistoryFile) { free(myHistoryFile); } if (!JobAd) { EXCEPT( "update_job_status(): No job ad"); } JobAd->LookupInteger(ATTR_TOTAL_SUSPENSIONS, tot_sus); JobAd->LookupInteger(ATTR_CUMULATIVE_SUSPENSION_TIME, cum_sus); JobAd->LookupInteger(ATTR_LAST_SUSPENSION_TIME, last_sus); //new syntax, can use filesystem to authenticate if (!ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT) || GetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_STATUS, &status) < 0) { EXCEPT("Failed to connect to schedd!"); } job_report_update_queue( Proc ); if( status == REMOVED ) { dprintf( D_ALWAYS, "update_job_status(): Job %d.%d has been removed " "by condor_rm\n", Proc->id.cluster, Proc->id.proc ); } else { SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_TOTAL_SUSPENSIONS, tot_sus); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_CUMULATIVE_SUSPENSION_TIME, cum_sus); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_LAST_SUSPENSION_TIME, last_sus); update_job_rusage( localp, remotep ); Proc->image_size = ImageSize; SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_IMAGE_SIZE, ImageSize); // For standard universe. MemoryUsed==ImageSize, no need to param this one. // because imagesize is already the best measure of memory usage. SetAttribute(Proc->id.cluster, Proc->id.proc, ATTR_MEMORY_USAGE, "((ImageSize+1023)/1024)"); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_EXIT_STATUS, JobExitStatus); rusage_to_float( Proc->local_usage, &utime, &stime ); SetAttributeFloat(Proc->id.cluster, Proc->id.proc, ATTR_JOB_LOCAL_USER_CPU, utime); SetAttributeFloat(Proc->id.cluster, Proc->id.proc, ATTR_JOB_LOCAL_SYS_CPU, stime); rusage_to_float( Proc->remote_usage[0], &utime, &stime ); SetAttributeFloat(Proc->id.cluster, Proc->id.proc, ATTR_JOB_REMOTE_USER_CPU, utime); SetAttributeFloat(Proc->id.cluster, Proc->id.proc, ATTR_JOB_REMOTE_SYS_CPU, stime); dprintf(D_FULLDEBUG,"TIME DEBUG 3 USR remotep=%lu Proc=%lu utime=%f\n", remotep->ru_utime.tv_sec, Proc->remote_usage[0].ru_utime.tv_sec, utime); dprintf(D_FULLDEBUG,"TIME DEBUG 4 SYS remotep=%lu Proc=%lu utime=%f\n", remotep->ru_stime.tv_sec, Proc->remote_usage[0].ru_stime.tv_sec, stime); if( sock_RSC1 ) { float TotalBytesSentUpdate = TotalBytesSent + sock_RSC1->get_bytes_sent() + BytesSent; float TotalBytesRecvdUpdate = TotalBytesRecvd + sock_RSC1->get_bytes_recvd() + BytesRecvd; SetAttributeFloat( Proc->id.cluster, Proc->id.proc, ATTR_BYTES_SENT, TotalBytesSentUpdate ); SetAttributeFloat( Proc->id.cluster, Proc->id.proc, ATTR_BYTES_RECVD, TotalBytesRecvdUpdate ); float RSCBytesSentUpdate = sock_RSC1->get_bytes_sent() + RSCBytesSent; float RSCBytesRecvdUpdate = sock_RSC1->get_bytes_recvd() + RSCBytesRecvd; SetAttributeFloat( Proc->id.cluster, Proc->id.proc, ATTR_RSC_BYTES_SENT, RSCBytesSentUpdate ); SetAttributeFloat( Proc->id.cluster, Proc->id.proc, ATTR_RSC_BYTES_RECVD, RSCBytesRecvdUpdate ); } if( ExitReason == JOB_CKPTED || ExitReason == JOB_NOT_CKPTED ) { SetAttributeInt( Proc->id.cluster, Proc->id.proc, ATTR_LAST_VACATE_TIME, time(0) ); } if( ExitReason == JOB_CKPTED || LastCkptTime > LastRestartTime ) { int uncommitted_suspension_time = 0; JobAd->LookupInteger(ATTR_UNCOMMITTED_SUSPENSION_TIME, uncommitted_suspension_time); if( uncommitted_suspension_time > 0 ) { int committed_suspension_time = 0; GetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_COMMITTED_SUSPENSION_TIME, &committed_suspension_time); committed_suspension_time += uncommitted_suspension_time; SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_COMMITTED_SUSPENSION_TIME, committed_suspension_time); } } // if we had checkpointed, then save all of these attributes as well. if (LastCkptTime > LastRestartTime) { SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_LAST_CKPT_TIME, LastCkptTime); CommittedTime=0; GetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_COMMITTED_TIME, &CommittedTime); CommittedTime += LastCkptTime - LastRestartTime; SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_COMMITTED_TIME, CommittedTime); LastRestartTime = LastCkptTime; SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_NUM_CKPTS, NumCkpts); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_NUM_RESTARTS, NumRestarts); if (Executing_Arch) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_CKPT_ARCH, Executing_Arch); } if (Executing_OpSys) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_CKPT_OPSYS, Executing_OpSys); } // If we wrote a checkpoint, store the location in the // LastCkptServer attribute. If we didn't use a checkpoint // server (i.e., we stored it locally), then make sure // no LastCkptServer attribute is set. if (LastCkptServer) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_LAST_CKPT_SERVER, LastCkptServer); } else { DeleteAttribute(Proc->id.cluster, Proc->id.proc, ATTR_LAST_CKPT_SERVER); } if (LastCkptPlatform) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_LAST_CHECKPOINT_PLATFORM, LastCkptPlatform); } } // if the job completed, we should include the run-time in // committed time, since it contributed to the completion of // the job. Also, commit the exit code/signal stuff, plus any // core filenames. if (Proc->status == COMPLETED) { int exit_code, exit_signal, exit_by_signal; int pending; // update the time. CommittedTime = 0; GetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_COMMITTED_TIME, &CommittedTime); CommittedTime += Proc->completion_date - LastRestartTime; SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_COMMITTED_TIME, CommittedTime); // if there is a core file, update that too. if (JobAd->LookupString(ATTR_JOB_CORE_FILENAME, buf, sizeof(buf))) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_JOB_CORE_FILENAME, buf); } // only new style ads have ATTR_ON_EXIT_BY_SIGNAL, so only // SetAttribute for those types of ads if (JobAd->LookupInteger(ATTR_ON_EXIT_BY_SIGNAL, exit_by_signal)==1) { SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_ON_EXIT_BY_SIGNAL, exit_by_signal); if (exit_by_signal == 1) /* exited via signal */ { JobAd->LookupInteger(ATTR_ON_EXIT_SIGNAL, exit_signal); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_ON_EXIT_SIGNAL, exit_signal); } else { JobAd->LookupInteger(ATTR_ON_EXIT_CODE, exit_code); SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_ON_EXIT_CODE, exit_code); } } // and now, let's try and mark this job as a terminate pending // job. If the job already is, then fine. We'll mark it again. if (JobAd->LookupBool(ATTR_TERMINATION_PENDING, pending)) { SetAttribute(Proc->id.cluster, Proc->id.proc, ATTR_TERMINATION_PENDING, pending?"TRUE":"FALSE"); } else { // if it isn't in the job ad, then add it to the saved ad in the // schedd. SetAttribute(Proc->id.cluster, Proc->id.proc, ATTR_TERMINATION_PENDING, "TRUE"); } // store the reason why the job is marked completed. if (JobAd->LookupString(ATTR_TERMINATION_REASON, buf, sizeof(buf))) { SetAttributeString(Proc->id.cluster, Proc->id.proc, ATTR_TERMINATION_REASON, buf); } // Set up the exit code the shadow was about to exit with to // help support the terminate pending "state". SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_TERMINATION_EXITREASON, ExitReason); // Put the job status as created by waitpid() into the job ad // itself. This is to implement the terminate_pending feature. It // is done like this because EVERYWHERE in this codebase we do // stuff like WIFEXITED(JobStatus) and it turns out there are no // user level macros to will one of those status values as returned // by waitpid() into existance. So, we'll put it directly into the // job ad to prevent me having to reimplement a few large functions // which deal with JobStatus directly--as it is sadly a global // variable. SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_WAITPID_STATUS, JobStatus); } } if (!DisconnectQ(0)) { EXCEPT("Failed to commit updated job queue status!"); } }