Example 1
static void
slurmdrmaa_job_on_missing( fsd_job_t *self )
{

	fsd_log_enter(( "({job_id=%s})", self->job_id ));
	fsd_log_warning(( "Job %s missing from DRM queue", self->job_id ));

	fsd_log_info(( "job_on_missing: last job_ps: %s (0x%02x)", drmaa_job_ps_to_str(self->state), self->state));

	if( self->state >= DRMAA_PS_RUNNING ) { /* if the job ever entered the running state, assume it finished */
		self->state = DRMAA_PS_DONE;
		self->exit_status = 0;
	}
	else {
		self->state = DRMAA_PS_FAILED; /* otherwise failed */
		self->exit_status = -1;
	}

	fsd_log_info(("job_on_missing evaluation result: state=%d exit_status=%d", self->state, self->exit_status));

	fsd_cond_broadcast( &self->status_cond);
	fsd_cond_broadcast( &self->session->wait_condition );

	fsd_log_return(( "; job_ps=%s, exit_status=%d", drmaa_job_ps_to_str(self->state), self->exit_status ));
}
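
The two broadcasts at the end wake any threads blocked waiting for this job's state to change. A minimal sketch of the waiting side, assuming the fsd_cond_*/fsd_mutex_* wrappers have pthreads-style semantics (as their use in the examples below suggests):

/* Hypothetical waiter: block until the job reaches a terminal
 * state (DRMAA_PS_DONE or DRMAA_PS_FAILED). Relies on the DRMAA
 * state constants being ordered, as the >= DRMAA_PS_RUNNING test
 * above already assumes. */
fsd_mutex_lock( &job->mutex );
while( job->state < DRMAA_PS_DONE )
	fsd_cond_wait( &job->status_cond, &job->mutex );
fsd_mutex_unlock( &job->mutex );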
Example 2
static void 
lsfdrmaa_job_on_missing(fsd_job_t *self)
{
	const bool lately_submitted =
			(self->flags & FSD_JOB_CURRENT_SESSION)
			&& self->state == DRMAA_PS_UNDETERMINED
			&& (time(NULL) - self->submit_time < LSFDRMAA_MAX_QUEUE_SKEW);

	if( lately_submitted ) {
		/* the job was submitted just now; give the DRM queue a moment before checking again */
		return;
	}

	if( self->session->query_retries > 0 &&
			self->retry_cnt < self->session->query_retries ) {
		self->retry_cnt++;
		fsd_log_warning(( "job %s missing from DRM queue", self->job_id ));
		return;
	}

	/* start a monitor task to collect information about the missing job */
	if( self->session->monitor_task[0] != '\0' ) {
		self->session->start_monitor_task( self->session, self->job_id );
	}

	fsd_log_warning(( "terminate the missing job %s", self->job_id ));
	self->state = DRMAA_PS_FAILED;
	self->flags &= ~FSD_JOB_STATE_MASK;
	self->flags |= FSD_JOB_ABORTED;
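	/* pack the code into the wait(2) exit-status layout; see the sketch after this example */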
	self->exit_status = ((self->session->missing_jobs_exit_code)&0xff)<<8;
	self->end_time = time(NULL);
	self->last_update_time = time(NULL);
	fsd_cond_broadcast( &self->status_cond ); 
}
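
The (missing_jobs_exit_code & 0xff) << 8 packing above mirrors the traditional wait(2) status layout, so callers using the standard macros recover the configured code. A quick self-contained check, assuming a platform with the classic layout (e.g. Linux):

#include <assert.h>
#include <sys/wait.h>

int main(void)
{
	int status = (42 & 0xff) << 8;   /* same packing as above */
	assert( WIFEXITED(status) );     /* low bits clear => normal exit */
	assert( WEXITSTATUS(status) == 42 );
	return 0;
}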
Example 3
void
fsd_job_set_signal_all( fsd_job_set_t *self )
{
	fsd_job_t *volatile job = NULL;
	fsd_mutex_t *volatile mutex = & self->mutex;

	fsd_log_enter(( "" ));
	fsd_mutex_lock( mutex );
	TRY
	 {
		volatile size_t i;
		for( i = 0;  i < self->tab_size;  i++ )
			for( job = self->tab[ i ];  job;  job = job->next )
			 {
				fsd_mutex_lock( &job->mutex );
				TRY{ fsd_cond_broadcast( &job->status_cond ); }
				FINALLY{ fsd_mutex_unlock( &job->mutex ); }
				END_TRY
			 }
	 }
	FINALLY
	 { fsd_mutex_unlock( mutex ); }
	END_TRY

	fsd_log_return(( "" ));
}
Example 4
void
fsd_drmaa_session_stop_wait_thread( fsd_drmaa_session_t *self )
{
	volatile int lock_count = 0;
	fsd_log_enter(( "" ));
	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		fsd_log_debug(("started = %d  run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));
		if( self->wait_thread_started )
		 {
			self->wait_thread_run_flag = false;
			fsd_log_debug(("started = %d  run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));
			fsd_cond_broadcast( &self->wait_condition );
			TRY
			 {
				lock_count = fsd_mutex_unlock_times( &self->mutex );
				fsd_thread_join( self->wait_thread_handle, NULL );
			 }
			FINALLY
			 {
				int i;
				for( i = 0;  i < lock_count;  i++ )
					fsd_mutex_lock( &self->mutex );
			 }
			END_TRY
			self->wait_thread_started = false;
		 }

	 }
	FINALLY
	 { fsd_mutex_unlock( &self->mutex ); }
	END_TRY

	fsd_log_return(( "" ));
}
Example 5
void
fsd_drmaa_session_release( fsd_drmaa_session_t *self )
{
	fsd_mutex_lock( &self->mutex );
	self->ref_cnt--;
	fsd_assert( self->ref_cnt > 0 );
	if( self->ref_cnt == 1 )
		fsd_cond_broadcast( &self->destroy_condition );
	fsd_mutex_unlock( &self->mutex );
}
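
release() drops one reference and wakes the destroyer once only the destroyer's own reference remains (ref_cnt == 1; Example 6 below waits on destroy_condition for exactly that). A hypothetical sketch of the matching acquire side, which the source presumably performs under the same mutex:

/* Hypothetical "retain" counterpart (name assumed, not taken from
 * the source): bump the count under the session mutex before
 * handing the pointer to another thread. */
static void
fsd_drmaa_session_retain( fsd_drmaa_session_t *self )
{
	fsd_mutex_lock( &self->mutex );
	self->ref_cnt++;
	fsd_mutex_unlock( &self->mutex );
}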
Example 6
void
fsd_drmaa_session_destroy( fsd_drmaa_session_t *self )
{
	bool already_destroying = false;

	fsd_log_enter(( "" ));
	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		if( self->destroy_requested )
			already_destroying = true;
		else
		 {
			self->destroy_requested = true;
			fsd_cond_broadcast( &self->wait_condition );
		 }
	 }
	FINALLY
	 { fsd_mutex_unlock( &self->mutex ); }
	END_TRY

	if( already_destroying )
	 { /* XXX: actually it cannot happen in the current implementation
				when using the DRMAA API */
		self->release( self );
		fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );
	 }

	self->jobs->signal_all( self->jobs );

	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		while( self->ref_cnt > 1 )
			fsd_cond_wait( &self->destroy_condition, &self->mutex );
		fsd_log_debug(("started = %d  run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));
		if( self->wait_thread_started )
			self->stop_wait_thread( self );
	 }
	FINALLY
	 { fsd_mutex_unlock( &self->mutex ); }
	END_TRY

	self->destroy_nowait( self );
	fsd_log_return(( "" ));
}
Example 7
void *
fsd_drmaa_session_wait_thread( fsd_drmaa_session_t *self )
{
	struct timespec ts, *next_check = &ts;
	bool volatile locked = false;

	fsd_log_enter(( "" ));
	locked = fsd_mutex_lock( &self->mutex );
	TRY
	 {
		while( self->wait_thread_run_flag )
			TRY
			 {
				fsd_log_debug(( "wait thread: next iteration" ));
				self->update_all_jobs_status( self );
				fsd_cond_broadcast( &self->wait_condition );
				
				fsd_get_time( next_check );
				fsd_ts_add( next_check, &self->pool_delay );
				fsd_cond_timedwait( &self->wait_condition, &self->mutex, next_check );
				
			 }
			EXCEPT_DEFAULT
			 {
				const fsd_exc_t *e = fsd_exc_get();
				fsd_log_error(( "wait thread: <%d:%s>", e->code(e), e->message(e) ));
			 }
			END_TRY
	 }
	FINALLY
	 { 
		if (locked)
			fsd_mutex_unlock( &self->mutex ); 
	 }
	END_TRY

	fsd_log_return(( " =NULL" ));
	return NULL;
}
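
For readers unfamiliar with the fsd_* wrappers, the loop above is a standard periodic-poll pattern: do work, broadcast, then sleep on the condition variable until either the pool delay elapses or someone (e.g. stop_wait_thread in Example 4) wakes the thread early. A schematic pthreads equivalent, assuming the wrappers map directly onto pthread primitives; all names here are illustrative, not from the source:

#include <pthread.h>
#include <time.h>

static void
poll_loop( pthread_mutex_t *mutex, pthread_cond_t *cond,
		volatile int *run_flag, long delay_sec )
{
	struct timespec next_check;

	pthread_mutex_lock( mutex );
	while( *run_flag )
	 {
		/* ... update job statuses here ... */
		pthread_cond_broadcast( cond );
		clock_gettime( CLOCK_REALTIME, &next_check );
		next_check.tv_sec += delay_sec;
		/* returns early if signalled, e.g. by a stop request */
		pthread_cond_timedwait( cond, mutex, &next_check );
	 }
	pthread_mutex_unlock( mutex );
}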
Example 8
static void
lsfdrmaa_job_read_job_info( fsd_job_t *self, struct jobInfoEnt *job_info )
{
	int status, flags;

	fsd_log_enter(( "" ));
	 {
		int i;
		fsd_log_debug(( "job status of %s updated from %d[%d]",
					self->job_id,
					LSB_ARRAY_JOBID(job_info->jobId),
					LSB_ARRAY_IDX(job_info->jobId) ));
		fsd_log_debug(( "\n  status: 0x%x", job_info->status ));
		fsd_log_debug(( "\n  submitTime: %ld", job_info->submitTime ));
		fsd_log_debug(( "\n  startTime: %ld", job_info->startTime ));
		fsd_log_debug(( "\n  endTime: %ld", job_info->startTime ));
		fsd_log_debug(( "\n  duration: %d", job_info->duration ));
		fsd_log_debug(( "\n  cpuTime: %f", job_info->cpuTime ));
		fsd_log_debug(( "\n  cwd: %s", job_info->cwd ));
		fsd_log_debug(( "\n  fromHost: %s", job_info->fromHost ));
		fsd_log_debug(( "\n  numExHosts: %d", job_info->numExHosts ));
		for( i = 0;  i < job_info->numExHosts;  i++ )
			fsd_log_debug(( "\n  exHosts[%d]: %s", i, job_info->exHosts[i] ));
		fsd_log_debug(( "\n  exitStatus: %d", job_info->exitStatus ));
		fsd_log_debug(( "\n  execCwd: %s", job_info->execCwd ));
		fsd_log_debug(( "\n  runRusage.mem: %d", job_info->runRusage.mem ));
		fsd_log_debug(( "\n  runRusage.swap: %d", job_info->runRusage.swap ));
		fsd_log_debug(( "\n  runRusage.utime: %d", job_info->runRusage.utime ));
		fsd_log_debug(( "\n  runRusage.stime: %d", job_info->runRusage.stime ));
		fsd_log_debug(( "\n  jName: %s", job_info->jName ));
		/* fsd_log_debug(( "\n  execRusage: %s", job_info->execRusage )); */
	 }

	status = job_info->status;

	flags = 0;
	if( status & (JOB_STAT_PEND | JOB_STAT_PSUSP) )
		flags |= FSD_JOB_QUEUED;
	if( status & JOB_STAT_PSUSP )
		flags |= FSD_JOB_HOLD;
	if( status & (JOB_STAT_RUN | JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_RUNNING;
	if( status & (JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_SUSPENDED;
	if( status & (JOB_STAT_DONE | JOB_STAT_EXIT) )
		flags |= FSD_JOB_TERMINATED;
	if( status & (JOB_STAT_EXIT | JOB_STAT_PERR) )
		flags |= FSD_JOB_ABORTED;
	self->flags &= ~(FSD_JOB_STATE_MASK | FSD_JOB_ABORTED);
	self->flags |= flags;

	if( status & (JOB_STAT_WAIT | JOB_STAT_PEND) )
		self->state = DRMAA_PS_QUEUED_ACTIVE;
	else if( status & JOB_STAT_PSUSP )
		self->state = DRMAA_PS_USER_ON_HOLD;
	else if( status & JOB_STAT_RUN )
		self->state = DRMAA_PS_RUNNING;
	else if( status & JOB_STAT_SSUSP )
		self->state = DRMAA_PS_SYSTEM_SUSPENDED;
	else if( status & JOB_STAT_USUSP )
		self->state = DRMAA_PS_USER_SUSPENDED;
	else if( status & JOB_STAT_DONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_EXIT )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_PDONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_PERR )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_UNKWN )
		self->state = DRMAA_PS_UNDETERMINED;
	else
		self->state = DRMAA_PS_FAILED;

	self->exit_status = job_info->exitStatus & ~0xff;
	if( (self->exit_status >> 8) == 0  &&  (job_info->status & JOB_STAT_EXIT) )
		self->exit_status |= 0x01;
	self->start_time = job_info->startTime;
	self->end_time = job_info->endTime;
	self->cpu_usage = job_info->cpuTime;
	self->mem_usage = max( self->mem_usage, 1024*job_info->runRusage.mem );
	self->vmem_usage = max( self->vmem_usage, 1024*job_info->runRusage.swap );
	self->walltime = 60*job_info->duration;
	self->n_execution_hosts = job_info->numExHosts;
	if( self->execution_hosts == NULL  &&  job_info->exHosts != NULL )
		self->execution_hosts
			= fsd_explode( (const char*const*)job_info->exHosts, ' ',
					job_info->numExHosts );
	self->last_update_time = time(NULL);
	if( self->state >= DRMAA_PS_DONE )
		fsd_cond_broadcast( &self->status_cond );
	fsd_log_return(( "" ));
}
Example 9
static void
slurmdrmaa_job_update_status( fsd_job_t *self )
{
	job_info_msg_t *job_info = NULL;
	slurmdrmaa_job_t * slurm_self = (slurmdrmaa_job_t *) self;
	fsd_log_enter(( "({job_id=%s})", self->job_id ));

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	{
		if ( slurm_load_job( &job_info, fsd_atoi(self->job_id), SHOW_ALL) ) {
			int _slurm_errno = slurm_get_errno();

			if (_slurm_errno == ESLURM_INVALID_JOB_ID) {
				self->on_missing(self);
			} else {
				fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"slurm_load_jobs error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id);
			}
		}
		if (job_info) {
			fsd_log_debug(("state = %d, state_reason = %d", job_info->job_array[0].job_state, job_info->job_array[0].state_reason));
			
			switch(job_info->job_array[0].job_state & JOB_STATE_BASE)
			{

				case JOB_PENDING:
					switch(job_info->job_array[0].state_reason)
					{
						#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0)
						case WAIT_HELD_USER:   /* job is held by user */
							fsd_log_debug(("interpreting as DRMAA_PS_USER_ON_HOLD"));
							self->state = DRMAA_PS_USER_ON_HOLD;
							break;
						#endif
						case WAIT_HELD:  /* job is held by administrator */
							fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_ON_HOLD"));
							self->state = DRMAA_PS_SYSTEM_ON_HOLD;
							break;
						default:
							fsd_log_debug(("interpreting as DRMAA_PS_QUEUED_ACTIVE"));
							self->state = DRMAA_PS_QUEUED_ACTIVE;
					}
					break;
				case JOB_RUNNING:
					fsd_log_debug(("interpreting as DRMAA_PS_RUNNING"));
					self->state = DRMAA_PS_RUNNING;
					break;
				case JOB_SUSPENDED:
					if(slurm_self->user_suspended == true) {
						fsd_log_debug(("interpreting as DRMAA_PS_USER_SUSPENDED"));
						self->state = DRMAA_PS_USER_SUSPENDED;
					} else {
						fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_SUSPENDED"));
						self->state = DRMAA_PS_SYSTEM_SUSPENDED;
					}
					break;
				case JOB_COMPLETE:
					fsd_log_debug(("interpreting as DRMAA_PS_DONE"));
					self->state = DRMAA_PS_DONE;
					self->exit_status = job_info->job_array[0].exit_code;
					fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
					break;
				case JOB_CANCELLED:
					fsd_log_debug(("interpreting as DRMAA_PS_FAILED (aborted)"));
					self->state = DRMAA_PS_FAILED;
					self->exit_status = -1;
					break; /* without this, the fallthrough below would overwrite exit_status */
				case JOB_FAILED:
				case JOB_TIMEOUT:
				case JOB_NODE_FAIL:
				#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,3,0)
				case JOB_PREEMPTED:
				#endif
					fsd_log_debug(("interpreting as DRMAA_PS_FAILED"));
					self->state = DRMAA_PS_FAILED;
					self->exit_status = job_info->job_array[0].exit_code;
					fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
					break;
				default: /* unknown state */
					fsd_log_error(("Unknown job state: %d. Please send bug report: http://apps.man.poznan.pl/trac/slurm-drmaa", job_info->job_array[0].job_state));
			}

			if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_COMPLETING) {
				fsd_log_debug(("Epilog completing"));
			}

			if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_CONFIGURING) {
				fsd_log_debug(("Nodes booting"));
			}

			if (self->exit_status == -1) /* input, output, error path failure, etc. */
				self->state = DRMAA_PS_FAILED;

			self->last_update_time = time(NULL);
		
			if( self->state >= DRMAA_PS_DONE ) {
				fsd_log_debug(("exit_status = %d, WEXITSTATUS(exit_status) = %d", self->exit_status, WEXITSTATUS(self->exit_status)));
				fsd_cond_broadcast( &self->status_cond );
			}
		}
	}
	FINALLY
	{
		if(job_info != NULL)
			slurm_free_job_info_msg (job_info);

		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	}
	END_TRY
	
	fsd_log_return(( "" ));
}