Example #1
0
// Put the current job (Proc) on hold and exit the shadow.
//
// Writes the hold reason, reason code and reason subcode into the job's
// queue entry, optionally mails the user an explanation, and then exits
// with JOB_SHOULD_HOLD.  This function never returns.
//
//   long_reason    - explanatory text written as the body of the email
//                    (may be NULL, in which case no explanation is added)
//   short_reason   - value stored in ATTR_HOLD_REASON
//   reason_code    - value stored in ATTR_HOLD_REASON_CODE
//   reason_subcode - value stored in ATTR_HOLD_REASON_SUBCODE
void
HoldJob( const char* long_reason, const char* short_reason, int reason_code,
		 int reason_subcode )
{
	char subject[ BUFSIZ ];
	FILE *mailer;

		// Bounded formatting: never write past the end of subject.
	snprintf( subject, sizeof(subject), "Condor Job %d.%d put on hold\n",
			  Proc->id.cluster, Proc->id.proc );

	if( ! JobAd ) {
		dprintf( D_ALWAYS, "In HoldJob() w/ NULL JobAd!\n" );
		exit( JOB_SHOULD_HOLD );
	}

	ExitReason = JOB_SHOULD_HOLD;
	if ( !ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT) ) {
			// NOTE(review): without a queue connection there is nothing
			// for the SetAttribute calls to write to, so skip them
			// (and the commit) instead of issuing them anyway.
		dprintf( D_ALWAYS, "Failed to connect to schedd!\n" );
	} else {
		SetAttributeString( Proc->id.cluster, Proc->id.proc,
							ATTR_HOLD_REASON, short_reason );
		SetAttributeInt( Proc->id.cluster, Proc->id.proc,
						 ATTR_HOLD_REASON_CODE, reason_code );
		SetAttributeInt( Proc->id.cluster, Proc->id.proc,
						 ATTR_HOLD_REASON_SUBCODE, reason_subcode );
		if ( !DisconnectQ(0) ) {
			dprintf( D_ALWAYS,
					 "Failed to commit updated job queue status!\n" );
		}
	}

		// A NULL mailer means the user didn't want email; in that case
		// we go straight to the exit below.
	mailer = email_user_open(JobAd, subject);
	if( mailer ) {
		fprintf( mailer, "Your condor job " );
		if( Proc->args_v1or2[0] ) {
			ArgList args;
			MyString args_string;
			args.AppendArgsV1or2Raw(Proc->args_v1or2[0],NULL);
			args.GetArgsStringForDisplay(&args_string);

			fprintf( mailer, "%s %s ", Proc->cmd[0], args_string.Value() );
		} else {
			fprintf( mailer, "%s ", Proc->cmd[0] );
		}
		fprintf( mailer, "\nis being put on hold.\n\n" );
			// Passing NULL for %s is undefined behavior, so guard it.
		if( long_reason ) {
			fprintf( mailer, "%s", long_reason );
		}
		email_close(mailer);
	}

		// Exit with the right value so the schedd actually puts the
		// job on hold.
	dprintf( D_ALWAYS, "Job going into Hold state.\n");
	dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n",
		JOB_SHOULD_HOLD);
	exit( JOB_SHOULD_HOLD );
}
Example #2
0
//---------------------------------------------------------------------------
// Store a string-valued attribute on the DAGMan job identified by
// _dagmanId.  On failure, emit a warning and let
// check_warning_strictness() decide how to react at DAG_STRICT_3.
void
DagmanClassad::SetDagAttribute( const char *attrName, const MyString &value )
{
	const int rc = SetAttributeString( _dagmanId._cluster, _dagmanId._proc,
									   attrName, value.Value() );

		// Zero means the attribute was set; nothing more to do.
	if ( rc == 0 ) {
		return;
	}

	debug_printf( DEBUG_QUIET,
				  "WARNING: failed to set attribute %s\n", attrName );
	check_warning_strictness( DAG_STRICT_3 );
}
Example #3
0
void
update_job_status( struct rusage *localp, struct rusage *remotep )
{
	int		status = -1;
	double utime = 0.0;
	double stime = 0.0;
	int tot_sus=0, cum_sus=0, last_sus=0;
	char buf[1024*50];

	// If the job completed, and there is no HISTORY file specified,
	// the don't bother to update the job ClassAd since it is about to be
	// flushed into the bit bucket by the schedd anyway.
	char *myHistoryFile = param("HISTORY");
	if ((Proc->status == COMPLETED) && (myHistoryFile==NULL)) {
		return;
	}

	if (myHistoryFile) {
		free(myHistoryFile);
	}

	if (!JobAd)
	{
		EXCEPT( "update_job_status(): No job ad");
	}
	JobAd->LookupInteger(ATTR_TOTAL_SUSPENSIONS, tot_sus);
	JobAd->LookupInteger(ATTR_CUMULATIVE_SUSPENSION_TIME, cum_sus);
	JobAd->LookupInteger(ATTR_LAST_SUSPENSION_TIME, last_sus);

	//new syntax, can use filesystem to authenticate
	if (!ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT) ||
		GetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_STATUS,
						&status) < 0) {
		EXCEPT("Failed to connect to schedd!");
	}
	job_report_update_queue( Proc );

	if( status == REMOVED ) {
		dprintf( D_ALWAYS, "update_job_status(): Job %d.%d has been removed "
				 "by condor_rm\n", Proc->id.cluster, Proc->id.proc );
	} else {

		SetAttributeInt(Proc->id.cluster, Proc->id.proc, 
			ATTR_TOTAL_SUSPENSIONS, tot_sus);

		SetAttributeInt(Proc->id.cluster, Proc->id.proc, 
			ATTR_CUMULATIVE_SUSPENSION_TIME, cum_sus);

		SetAttributeInt(Proc->id.cluster, Proc->id.proc, 
			ATTR_LAST_SUSPENSION_TIME, last_sus);

		update_job_rusage( localp, remotep );

		Proc->image_size = ImageSize;

		SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_IMAGE_SIZE, 
						ImageSize);
		// For standard universe. MemoryUsed==ImageSize, no need to param this one.
		// because imagesize is already the best measure of memory usage.
		SetAttribute(Proc->id.cluster, Proc->id.proc, ATTR_MEMORY_USAGE, "((ImageSize+1023)/1024)");

		SetAttributeInt(Proc->id.cluster, Proc->id.proc, ATTR_JOB_EXIT_STATUS,
						JobExitStatus);

		rusage_to_float( Proc->local_usage, &utime, &stime );
		SetAttributeFloat(Proc->id.cluster, Proc->id.proc,
						  ATTR_JOB_LOCAL_USER_CPU, utime);
		SetAttributeFloat(Proc->id.cluster, Proc->id.proc,
						  ATTR_JOB_LOCAL_SYS_CPU, stime);

		rusage_to_float( Proc->remote_usage[0], &utime, &stime );
		SetAttributeFloat(Proc->id.cluster, Proc->id.proc,
						  ATTR_JOB_REMOTE_USER_CPU, utime);
		SetAttributeFloat(Proc->id.cluster, Proc->id.proc,
						  ATTR_JOB_REMOTE_SYS_CPU, stime);
		dprintf(D_FULLDEBUG,"TIME DEBUG 3 USR remotep=%lu Proc=%lu utime=%f\n",
				remotep->ru_utime.tv_sec,
				Proc->remote_usage[0].ru_utime.tv_sec, utime);
		dprintf(D_FULLDEBUG,"TIME DEBUG 4 SYS remotep=%lu Proc=%lu utime=%f\n",
				remotep->ru_stime.tv_sec,
				Proc->remote_usage[0].ru_stime.tv_sec, stime);

		if( sock_RSC1 ) {
			float TotalBytesSentUpdate =
				TotalBytesSent + sock_RSC1->get_bytes_sent() + BytesSent;
			float TotalBytesRecvdUpdate =
				TotalBytesRecvd + sock_RSC1->get_bytes_recvd() + BytesRecvd;
			SetAttributeFloat( Proc->id.cluster, Proc->id.proc,
							   ATTR_BYTES_SENT, TotalBytesSentUpdate );
			SetAttributeFloat( Proc->id.cluster, Proc->id.proc,
							   ATTR_BYTES_RECVD, TotalBytesRecvdUpdate );
			float RSCBytesSentUpdate =
				sock_RSC1->get_bytes_sent() + RSCBytesSent;
			float RSCBytesRecvdUpdate = 
				sock_RSC1->get_bytes_recvd() + RSCBytesRecvd;
			SetAttributeFloat( Proc->id.cluster, Proc->id.proc,
							   ATTR_RSC_BYTES_SENT, RSCBytesSentUpdate );
			SetAttributeFloat( Proc->id.cluster, Proc->id.proc,
							   ATTR_RSC_BYTES_RECVD, RSCBytesRecvdUpdate );
		}

		if( ExitReason == JOB_CKPTED || ExitReason == JOB_NOT_CKPTED ) {
			SetAttributeInt( Proc->id.cluster, Proc->id.proc,
							 ATTR_LAST_VACATE_TIME, time(0) );
		}

		if( ExitReason == JOB_CKPTED || LastCkptTime > LastRestartTime ) {
			int uncommitted_suspension_time = 0;
			JobAd->LookupInteger(ATTR_UNCOMMITTED_SUSPENSION_TIME, uncommitted_suspension_time);
			if( uncommitted_suspension_time > 0 ) {
				int committed_suspension_time = 0;
				GetAttributeInt(Proc->id.cluster, Proc->id.proc,
								ATTR_COMMITTED_SUSPENSION_TIME, &committed_suspension_time);
				committed_suspension_time += uncommitted_suspension_time;
				SetAttributeInt(Proc->id.cluster, Proc->id.proc,
								ATTR_COMMITTED_SUSPENSION_TIME, committed_suspension_time);
			}
		}

		// if we had checkpointed, then save all of these attributes as well.
		if (LastCkptTime > LastRestartTime) {
			SetAttributeInt(Proc->id.cluster, Proc->id.proc,
							ATTR_LAST_CKPT_TIME, LastCkptTime);
			CommittedTime=0;
			GetAttributeInt(Proc->id.cluster, Proc->id.proc,
							ATTR_JOB_COMMITTED_TIME, &CommittedTime);
			CommittedTime += LastCkptTime - LastRestartTime;
			SetAttributeInt(Proc->id.cluster, Proc->id.proc,
							ATTR_JOB_COMMITTED_TIME, CommittedTime);
			LastRestartTime = LastCkptTime;
			SetAttributeInt(Proc->id.cluster, Proc->id.proc,
							ATTR_NUM_CKPTS, NumCkpts);
			SetAttributeInt(Proc->id.cluster, Proc->id.proc,
							ATTR_NUM_RESTARTS, NumRestarts);
			if (Executing_Arch) {
				SetAttributeString(Proc->id.cluster, Proc->id.proc,
								   ATTR_CKPT_ARCH, Executing_Arch);
			}
			if (Executing_OpSys) {
				SetAttributeString(Proc->id.cluster, Proc->id.proc,
								   ATTR_CKPT_OPSYS, Executing_OpSys);
			}
				// If we wrote a checkpoint, store the location in the
				// LastCkptServer attribute.  If we didn't use a checkpoint
				// server (i.e., we stored it locally), then make sure
				// no LastCkptServer attribute is set.
			if (LastCkptServer) {
				SetAttributeString(Proc->id.cluster, Proc->id.proc,
								   ATTR_LAST_CKPT_SERVER, LastCkptServer);
			} else {
				DeleteAttribute(Proc->id.cluster, Proc->id.proc,
								   ATTR_LAST_CKPT_SERVER);
			}

			if (LastCkptPlatform) {
				SetAttributeString(Proc->id.cluster, Proc->id.proc,
								   ATTR_LAST_CHECKPOINT_PLATFORM, 
								   LastCkptPlatform);
			}
		}
		// if the job completed, we should include the run-time in
		// committed time, since it contributed to the completion of
		// the job. Also, commit the exit code/signal stuff, plus any 
		// core filenames.
		if (Proc->status == COMPLETED) {
			int exit_code, exit_signal, exit_by_signal;
			int pending;

			// update the time.
			CommittedTime = 0;
			GetAttributeInt(Proc->id.cluster, Proc->id.proc,
							ATTR_JOB_COMMITTED_TIME, &CommittedTime);
			CommittedTime += Proc->completion_date - LastRestartTime;
			SetAttributeInt(Proc->id.cluster, Proc->id.proc,
							ATTR_JOB_COMMITTED_TIME, CommittedTime);

			// if there is a core file, update that too.
			if (JobAd->LookupString(ATTR_JOB_CORE_FILENAME, buf, sizeof(buf))) {
				SetAttributeString(Proc->id.cluster, Proc->id.proc,
			   		ATTR_JOB_CORE_FILENAME, buf);
			}

			// only new style ads have ATTR_ON_EXIT_BY_SIGNAL, so only
			// SetAttribute for those types of ads
			if (JobAd->LookupInteger(ATTR_ON_EXIT_BY_SIGNAL, exit_by_signal)==1)
			{
				SetAttributeInt(Proc->id.cluster, Proc->id.proc,
			   		ATTR_ON_EXIT_BY_SIGNAL, exit_by_signal);

				if (exit_by_signal == 1) /* exited via signal */
				{
					JobAd->LookupInteger(ATTR_ON_EXIT_SIGNAL, exit_signal);
					SetAttributeInt(Proc->id.cluster, Proc->id.proc,
						   			ATTR_ON_EXIT_SIGNAL, exit_signal);
				}
				else
				{
					JobAd->LookupInteger(ATTR_ON_EXIT_CODE, exit_code);
					SetAttributeInt(Proc->id.cluster, Proc->id.proc,
						   			ATTR_ON_EXIT_CODE, exit_code);
				}
			}

			// and now, let's try and mark this job as a terminate pending
			// job. If the job already is, then fine. We'll mark it again.
			if (JobAd->LookupBool(ATTR_TERMINATION_PENDING, pending)) {
				SetAttribute(Proc->id.cluster, Proc->id.proc,
			   			ATTR_TERMINATION_PENDING, pending?"TRUE":"FALSE");
			} else {
				// if it isn't in the job ad, then add it to the saved ad in the
				// schedd.
				SetAttribute(Proc->id.cluster, Proc->id.proc,
			   			ATTR_TERMINATION_PENDING, "TRUE");
			}

			// store the reason why the job is marked completed.
			if (JobAd->LookupString(ATTR_TERMINATION_REASON, buf, sizeof(buf))) {
				SetAttributeString(Proc->id.cluster, Proc->id.proc,
				   			ATTR_TERMINATION_REASON, buf);
			}

			// Set up the exit code the shadow was about to exit with to
			// help support the terminate pending "state".
			SetAttributeInt(Proc->id.cluster, Proc->id.proc,
				   			ATTR_TERMINATION_EXITREASON, ExitReason);

			// Put the job status as created by waitpid() into the job ad
			// itself.  This is to implement the terminate_pending feature. It
			// is done like this because EVERYWHERE in this codebase we do
			// stuff like WIFEXITED(JobStatus) and it turns out there are no
			// user level macros to will one of those status values as returned
			// by waitpid() into existance. So, we'll put it directly into the
			// job ad to prevent me having to reimplement a few large functions
			// which deal with JobStatus directly--as it is sadly a global
			// variable.
			SetAttributeInt(Proc->id.cluster, Proc->id.proc,
				   			ATTR_WAITPID_STATUS, JobStatus);

		}
	}


	if (!DisconnectQ(0)) {
		EXCEPT("Failed to commit updated job queue status!");
	}

}