Пример #1
0
char *
lsb_jobid2str (LS_LONG_INT jobId)
{
    static  char string[32];

    
    if (LSB_ARRAY_IDX(jobId) == 0) {
	sprintf(string, "%d",  LSB_ARRAY_JOBID(jobId));
    }
    else {
	sprintf(string, "%d[%d]",  LSB_ARRAY_JOBID(jobId),
	    LSB_ARRAY_IDX(jobId));
    }

    return(string);

} 
Пример #2
0
static void
lsfdrmaa_job_control( fsd_job_t *self, int action )
{
	/*
	 * XXX: waiting for job state change was removed
	 * since it is not required for drmaa_control
	 * to return after change completes.
	 */
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	LS_LONG_INT job_id;
	int signal;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));
	job_id = lsf_self->int_job_id;
	switch( action )
	 {
		case DRMAA_CONTROL_SUSPEND:
		case DRMAA_CONTROL_HOLD:
			signal = SIGSTOP;
			break;
		case DRMAA_CONTROL_RESUME:
		case DRMAA_CONTROL_RELEASE:
			signal = SIGCONT;
			break;
		case DRMAA_CONTROL_TERMINATE:
			/* TODO: sending SIGTERM (configurable)? */
			signal = SIGKILL;
			break;
		default:
			fsd_exc_raise_fmt(
					FSD_ERRNO_INVALID_ARGUMENT,
					"job::control: unknown action %d", action );
	 }

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	 {
		int rc = lsb_signaljob( lsf_self->int_job_id, signal );
		fsd_log_debug(( "lsb_signaljob( %d[%d], %d ) = %d",
					LSB_ARRAY_JOBID(lsf_self->int_job_id),
					LSB_ARRAY_IDX(lsf_self->int_job_id),
					signal, rc ));
		if( rc < 0 )
			fsd_exc_raise_fmt(
					FSD_ERRNO_INTERNAL_ERROR,
					"job::control: could not send %s to job %s",
					fsd_strsignal( signal ), self->job_id
					);
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	 }
	END_TRY

	fsd_log_return(( "" ));
}
Пример #3
0
static void
lsfdrmaa_job_update_status( fsd_job_t *self )
{
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	struct jobInfoEnt *volatile job_info = NULL;
	bool job_in_queue;	
	
	fsd_log_enter(( "({job_id=%s, time_delta=%d})", self->job_id, time(NULL) - self->submit_time ));
	do {
		fsd_mutex_lock( &self->session->drm_connection_mutex );
		TRY
		 {
			int n_records;
			int more;
			char * username = (lsf_self->int_job_id>0)?"all":NULL;

			fsd_log_debug(( "drm connection locked" ));

			n_records = lsb_openjobinfo( lsf_self->int_job_id,
						NULL, username, NULL, NULL, ALL_JOB );
			fsd_log_debug((
						"lsb_openjobinfo( %d[%d], NULL, %s, NULL, NULL, ALL_JOB ) =%d",
						LSB_ARRAY_JOBID(lsf_self->int_job_id),
						LSB_ARRAY_IDX(lsf_self->int_job_id),
						username?username:"******",
						n_records ));
			
						job_in_queue = n_records > 0;
			
						if(!job_in_queue){
				if(!(self->flags & FSD_JOB_CURRENT_SESSION)){
									fsd_exc_raise_code( FSD_DRMAA_ERRNO_INVALID_JOB );
								 }else{/*handling missing job*/
									 self->on_missing(self);
								 }
						}else{
								job_info = lsb_readjobinfo( &more );
				fsd_log_debug(( "lsb_readjobinfo(...) =%p: more=%d",
							(void*)job_info, more ));
				if( job_info == NULL )
					fsd_exc_raise_lsf( "lsb_readjobinfo" );
				lsf_self->read_job_info( self, job_info );
						}
		 }
		FINALLY
		 {
			/* lsfdrmaa_free_job_info( job_info ); */
			lsb_closejobinfo();
			fsd_log_debug(( "lsb_closejobinfo()" ));
			fsd_mutex_unlock( &self->session->drm_connection_mutex );
		 }
		END_TRY

	} while( !job_in_queue );
	fsd_log_return(( "" ));
}
Пример #4
0
static void
lsfdrmaa_job_read_job_info( fsd_job_t *self, struct jobInfoEnt *job_info )
{
	int status, flags;

	fsd_log_enter(( "" ));
	 {
		int i;
		fsd_log_debug(( "job status of %s updated from %d[%d]",
					self->job_id,
					LSB_ARRAY_JOBID(job_info->jobId),
					LSB_ARRAY_IDX(job_info->jobId) ));
		fsd_log_debug(( "\n  status: 0x%x", job_info->status ));
		fsd_log_debug(( "\n  submitTime: %ld", job_info->submitTime ));
		fsd_log_debug(( "\n  startTime: %ld", job_info->startTime ));
		fsd_log_debug(( "\n  endTime: %ld", job_info->startTime ));
		fsd_log_debug(( "\n  duration: %d", job_info->duration ));
		fsd_log_debug(( "\n  cpuTime: %f", job_info->cpuTime ));
		fsd_log_debug(( "\n  cwd: %s", job_info->cwd ));
		fsd_log_debug(( "\n  fromHost: %s", job_info->fromHost ));
		fsd_log_debug(( "\n  numExHosts: %d", job_info->numExHosts ));
		for( i = 0;  i < job_info->numExHosts;  i++ )
			fsd_log_debug(( "\n  exHosts[%d]: %s", i, job_info->exHosts[i] ));
		fsd_log_debug(( "\n  exitStatus: %d", job_info->exitStatus ));
		fsd_log_debug(( "\n  execCwd: %s", job_info->execCwd ));
		fsd_log_debug(( "\n  runRusage.mem: %d", job_info->runRusage.mem ));
		fsd_log_debug(( "\n  runRusage.swap: %d", job_info->runRusage.swap ));
		fsd_log_debug(( "\n  runRusage.utime: %d", job_info->runRusage.utime ));
		fsd_log_debug(( "\n  runRusage.stime: %d", job_info->runRusage.stime ));
		fsd_log_debug(( "\n  jName: %s", job_info->jName ));
		/* fsd_log_debug(( "\n  execRusage: %s", job_info->execRusage )); */
	 }

	status = job_info->status;

	flags = 0;
	if( status & (JOB_STAT_PEND | JOB_STAT_PSUSP) )
		flags |= FSD_JOB_QUEUED;
	if( status & JOB_STAT_PSUSP )
		flags |= FSD_JOB_HOLD;
	if( status & (JOB_STAT_RUN | JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_RUNNING;
	if( status & (JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_SUSPENDED;
	if( status & (JOB_STAT_DONE | JOB_STAT_EXIT) )
		flags |= FSD_JOB_TERMINATED;
	if( status & (JOB_STAT_EXIT | JOB_STAT_PERR) )
		flags |= FSD_JOB_ABORTED;
	self->flags &= ~(FSD_JOB_STATE_MASK | FSD_JOB_ABORTED);
	self->flags |= flags;

	if( status & (JOB_STAT_WAIT | JOB_STAT_PEND) )
		self->state = DRMAA_PS_QUEUED_ACTIVE;
	else if( status & JOB_STAT_PSUSP )
		self->state = DRMAA_PS_USER_ON_HOLD;
	else if( status & JOB_STAT_RUN )
		self->state = DRMAA_PS_RUNNING;
	else if( status & JOB_STAT_SSUSP )
		self->state = DRMAA_PS_SYSTEM_SUSPENDED;
	else if( status & JOB_STAT_USUSP )
		self->state = DRMAA_PS_USER_SUSPENDED;
	else if( status & JOB_STAT_DONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_EXIT )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_PDONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_PERR )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_UNKWN )
		self->state = DRMAA_PS_UNDETERMINED;
	else
		self->state = DRMAA_PS_FAILED;

	self->exit_status = job_info->exitStatus & ~0xff;
	if( (self->exit_status >> 8) == 0  &&  (job_info->status & JOB_STAT_EXIT) )
		self->exit_status |= 0x01;
	self->start_time = job_info->startTime;
	self->end_time = job_info->endTime;
	self->cpu_usage = job_info->cpuTime;
	self->mem_usage = max( self->mem_usage, 1024*job_info->runRusage.mem );
	self->vmem_usage = max( self->vmem_usage, 1024*job_info->runRusage.swap );
	self->walltime = 60*job_info->duration;
	self->n_execution_hosts = job_info->numExHosts;
	if( self->execution_hosts == NULL  &&  job_info->exHosts != NULL )
		self->execution_hosts
			= fsd_explode( (const char*const*)job_info->exHosts, ' ',
					job_info->numExHosts );
	self->last_update_time = time(NULL);
	if( self->state >= DRMAA_PS_DONE )
		fsd_cond_broadcast( &self->status_cond );
	fsd_log_return(( "" ));
}