Пример #1
0
/*
 * Waits on `wait_condition` (using `mutex`) until the earlier of two
 * deadlines: the current time plus self->pool_delay, or the caller's
 * `timeout` when one is given and it comes first.
 *
 * Raises FSD_DRMAA_ERRNO_EXIT_TIMEOUT when fsd_cond_timedwait() returns
 * false and the deadline that was used is the caller's `timeout`.
 */
void
fsd_drmaa_session_wait_for_job_status_change(
		fsd_drmaa_session_t *self,
		fsd_cond_t *wait_condition,
		fsd_mutex_t *mutex,
		const struct timespec *timeout
		)
{
	struct timespec poll_deadline;
	struct timespec *deadline = &poll_deadline;
	bool signaled;

	if( timeout != NULL )
		fsd_log_enter((
					"(timeout=%ld.%09ld)",
					timeout->tv_sec, timeout->tv_nsec ));
	else
		fsd_log_enter(( "(timeout=(null))" ));

	/* Default deadline: one polling interval from now. */
	fsd_get_time( &poll_deadline );
	fsd_ts_add( &poll_deadline, &self->pool_delay );

	/* Use the caller's timeout when it expires before the poll deadline. */
	if( timeout != NULL  &&  fsd_ts_cmp( timeout, &poll_deadline ) < 0 )
		deadline = (struct timespec*)timeout;

	fsd_log_debug(( "wait_for_job_status_change: waiting untill %ld.%09ld",
				deadline->tv_sec, deadline->tv_nsec ));
	signaled = fsd_cond_timedwait( wait_condition, mutex, deadline );

	/* Only an expired caller timeout is reported as an error; an expired
	 * poll deadline falls through to the normal return. */
	if( deadline == timeout  &&  !signaled )
		fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );

	fsd_log_return(( ": next_check=%ld.%09ld, status_changed=%d",
				deadline->tv_sec, deadline->tv_nsec,
				(int)signaled
				));
}
Пример #2
0
/*
 * Unlinks `job` from the hash bucket selected by its job_id, decrements
 * the set's job counter and the job's reference count.
 *
 * The set mutex is held for the whole operation and released in FINALLY.
 * Raises FSD_DRMAA_ERRNO_INVALID_JOB when `job` is not in that bucket.
 */
void
fsd_job_set_remove( fsd_job_set_t *self, fsd_job_t *job )
{
	fsd_job_t **link = NULL;
	uint32_t bucket;

	fsd_log_enter(( "(job_id=%s)", job->job_id ));
	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		bucket = hashstr( job->job_id, strlen(job->job_id), 0 );
		bucket &= self->tab_mask;

		/* Track the link that points at the current node so the node
		 * can be spliced out without a separate "previous" pointer. */
		link = &self->tab[ bucket ];
		while( *link != NULL  &&  *link != job )
			link = &(*link)->next;

		if( *link == NULL )
			fsd_exc_raise_code( FSD_DRMAA_ERRNO_INVALID_JOB );

		*link = (*link)->next;
		job->next = NULL;
		self->n_jobs--;
		job->ref_cnt--;
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY
	fsd_log_return(( ": job->ref_cnt=%d", job->ref_cnt ));
}
Пример #3
0
/*
 * Handles a job that is no longer reported by the DRM: logs the event,
 * derives a final state from the last known one and wakes up waiters on
 * the job's status condition and on the session's wait condition.
 */
static void
slurmdrmaa_job_on_missing( fsd_job_t *self )
{
	int ever_ran;

	fsd_log_enter(( "({job_id=%s})", self->job_id ));
	fsd_log_warning(( "Job %s missing from DRM queue", self->job_id ));

	fsd_log_info(( "job_on_missing: last job_ps: %s (0x%02x)", drmaa_job_ps_to_str(self->state), self->state));

	/* A job that reached the running state (or later) is assumed to have
	 * finished; anything earlier is treated as failed. */
	ever_ran = ( self->state >= DRMAA_PS_RUNNING );
	if( !ever_ran )
	 {
		self->state = DRMAA_PS_FAILED;
		self->exit_status = -1;
	 }
	else
	 {
		self->state = DRMAA_PS_DONE;
		self->exit_status = 0;
	 }

	fsd_log_info(("job_on_missing evaluation result: state=%d exit_status=%d", self->state, self->exit_status));

	fsd_cond_broadcast( &self->status_cond);
	fsd_cond_broadcast( &self->session->wait_condition );

	fsd_log_return(( "; job_ps=%s, exit_status=%d", drmaa_job_ps_to_str(self->state), self->exit_status ));
}
Пример #4
0
/*
 * Looks up a job by its id in the hash table.
 *
 * On a hit the job's own mutex is locked and its reference count is
 * incremented before the set mutex is released, so the job is returned
 * locked. Returns NULL when no job with that id is in the set.
 */
fsd_job_t *
fsd_job_set_get( fsd_job_set_t *self, const char *job_id )
{
	fsd_job_t *found = NULL;
	uint32_t bucket;

	fsd_log_enter(( "(job_id=%s)", job_id ));
	fsd_mutex_lock( &self->mutex );

	bucket = hashstr( job_id, strlen(job_id), 0 );
	bucket &= self->tab_mask;

	/* Scan the bucket chain for an exact id match. */
	found = self->tab[ bucket ];
	while( found != NULL  &&  strcmp( found->job_id, job_id ) != 0 )
		found = found->next;

	if( found != NULL )
	 {
		fsd_mutex_lock( &found->mutex );
		fsd_assert( !(found->flags & FSD_JOB_DISPOSED) );
		found->ref_cnt ++;
	 }
	fsd_mutex_unlock( &self->mutex );

	if( found == NULL )
	 {
		fsd_log_return(( "(job_id=%s) =NULL", job_id ));
	 }
	else
	 {
		fsd_log_return(( "(job_id=%s) =%p: ref_cnt=%d [lock %s]",
					job_id, (void*)found, found->ref_cnt, found->job_id ));
	 }
	return found;
}
Пример #5
0
/*
 * Returns the first job in table order whose state is DRMAA_PS_DONE or
 * greater, or NULL when there is none.
 *
 * A returned job has its mutex locked and its reference count incremented.
 * The set mutex is held during the search and released in FINALLY.
 */
fsd_job_t *
fsd_job_set_find_terminated( fsd_job_set_t *self )
{
	fsd_job_t *hit = NULL;
	size_t bucket;
	fsd_mutex_t* volatile set_mutex = & self->mutex;

	fsd_log_enter(( "()" ));
	fsd_mutex_lock( set_mutex );
	TRY
	 {
		/* Stop at the first bucket whose chain yields a terminated job;
		 * the inner scan leaves `hit` NULL when the chain has none. */
		for( bucket = 0;  hit == NULL  &&  bucket < self->tab_size;  bucket++ )
		 {
			hit = self->tab[ bucket ];
			while( hit != NULL  &&  hit->state < DRMAA_PS_DONE )
				hit = hit->next;
		 }

		if( hit != NULL )
		 {
			fsd_mutex_lock( &hit->mutex );
			fsd_assert( !(hit->flags & FSD_JOB_DISPOSED) );
			hit->ref_cnt ++;
		 }
	 }
	FINALLY
	 {
		fsd_mutex_unlock( set_mutex );
	 }
	END_TRY

	if( hit == NULL )
	 {
		fsd_log_return(( "() =%p", (void*)hit ));
	 }
	else
	 {
		fsd_log_return(( "() =%p: job_id=%s, ref_cnt=%d [lock %s]",
					(void*)hit, hit->job_id, hit->ref_cnt, hit->job_id ));
	 }
	return hit;
}
Пример #6
0
/*
 * Broadcasts the status condition of every job in the set.
 *
 * The set mutex is held for the whole walk; each job's mutex is held only
 * around its own broadcast and is released even if the broadcast raises.
 */
void
fsd_job_set_signal_all( fsd_job_set_t *self )
{
	fsd_job_t *volatile cur = NULL;
	fsd_mutex_t *volatile set_mutex = & self->mutex;

	fsd_log_enter(( "" ));
	fsd_mutex_lock( set_mutex );
	TRY
	 {
		volatile size_t bucket = 0;
		while( bucket < self->tab_size )
		 {
			cur = self->tab[ bucket ];
			while( cur != NULL )
			 {
				fsd_mutex_lock( &cur->mutex );
				TRY
				 { fsd_cond_broadcast( &cur->status_cond ); }
				FINALLY
				 { fsd_mutex_unlock( &cur->mutex ); }
				END_TRY
				cur = cur->next;
			 }
			bucket++;
		 }
	 }
	FINALLY
	 {
		fsd_mutex_unlock( set_mutex );
	 }
	END_TRY

	fsd_log_return(( "" ));
}
Пример #7
0
void
fsd_drmaa_session_stop_wait_thread( fsd_drmaa_session_t *self )
{
	volatile int lock_count = 0;
	fsd_log_enter(( "" ));
	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		fsd_log_debug(("started = %d  run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));
		if( self->wait_thread_started )
		 {
			self->wait_thread_run_flag = false;
			fsd_log_debug(("started = %d  run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));
			fsd_cond_broadcast( &self->wait_condition );
			TRY
			 {
				lock_count = fsd_mutex_unlock_times( &self->mutex );
				fsd_thread_join( self->wait_thread_handle, NULL );
			 }
			FINALLY
			 {
				int i;
				for( i = 0;  i < lock_count;  i++ )
					fsd_mutex_lock( &self->mutex );
			 }
			END_TRY
			self->wait_thread_started = false;
		 }

	 }
/*
 * Sets a scalar template attribute, or clears it when `value` is NULL.
 *
 * self  - template whose attribute table is updated
 * name  - attribute name, resolved through self->by_name()
 * value - new value (copied with fsd_strdup), or NULL to clear
 *
 * Raises FSD_ERRNO_INVALID_ARGUMENT when `name` is NULL, when it does not
 * name a scalar attribute, or when `value` is longer than
 * DRMAA_MAX_ATTR_LEN.
 *
 * The previously stored string is released in both the set and the clear
 * case. The copy of `value` is made before the old string is freed, so the
 * table never holds a dangling pointer if the copy raises, and passing the
 * currently stored string as `value` is safe.
 */
static void
fsd_template_set_attr( fsd_template_t *self,
		const char *name, const char *value )
{
	const fsd_attribute_t *attr = NULL;
	char *new_value = NULL;

	/* Either argument may be NULL here; never hand NULL to "%s". */
	fsd_log_enter(("(%s=%s)",
				name ? name : "(null)", value ? value : "(null)"));
	if( name == NULL )
		fsd_exc_raise_code( FSD_ERRNO_INVALID_ARGUMENT );
	attr = self->by_name( self, name );
	if( attr == NULL  ||  attr->is_vector )
		fsd_exc_raise_fmt(
				FSD_ERRNO_INVALID_ARGUMENT,
				"invalid scalar attribute name: %s", name
				);
	if( value != NULL ) {
		size_t value_len = strlen( value );

		if( value_len > DRMAA_MAX_ATTR_LEN )
			fsd_exc_raise_fmt(
				FSD_ERRNO_INVALID_ARGUMENT,
				"Argument length exceeds max size: %d > %d", (int)value_len, DRMAA_MAX_ATTR_LEN
				);

		new_value = fsd_strdup( value );
	}

	/* Release whatever was stored before, then install the new value
	 * (NULL when the attribute is being cleared). */
	if( self->attributes[ attr->code ] != NULL ) {
		fsd_free(self->attributes[ attr->code ]);
	}
	self->attributes[ attr->code ] = new_value;
}
Пример #9
0
/*
 * Maps a DRMAA control action to a signal and sends it to the LSF job
 * with lsb_signaljob() while holding the session's DRM connection mutex.
 *
 * SUSPEND/HOLD -> SIGSTOP, RESUME/RELEASE -> SIGCONT, TERMINATE -> SIGKILL.
 * Raises FSD_ERRNO_INVALID_ARGUMENT for any other action and
 * FSD_ERRNO_INTERNAL_ERROR when lsb_signaljob() returns a negative value.
 */
static void
lsfdrmaa_job_control( fsd_job_t *self, int action )
{
	/*
	 * XXX: waiting for job state change was removed
	 * since it is not required for drmaa_control
	 * to return after change completes.
	 */
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	LS_LONG_INT job_id;
	int signo;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));
	job_id = lsf_self->int_job_id;

	switch( action )
	 {
		case DRMAA_CONTROL_TERMINATE:
			/* TODO: sending SIGTERM (configurable)? */
			signo = SIGKILL;
			break;
		case DRMAA_CONTROL_RESUME:
		case DRMAA_CONTROL_RELEASE:
			signo = SIGCONT;
			break;
		case DRMAA_CONTROL_SUSPEND:
		case DRMAA_CONTROL_HOLD:
			signo = SIGSTOP;
			break;
		default:
			fsd_exc_raise_fmt(
					FSD_ERRNO_INVALID_ARGUMENT,
					"job::control: unknown action %d", action );
	 }

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	 {
		int rc = lsb_signaljob( job_id, signo );

		fsd_log_debug(( "lsb_signaljob( %d[%d], %d ) = %d",
					LSB_ARRAY_JOBID(job_id),
					LSB_ARRAY_IDX(job_id),
					signo, rc ));
		if( rc < 0 )
			fsd_exc_raise_fmt(
					FSD_ERRNO_INTERNAL_ERROR,
					"job::control: could not send %s to job %s",
					fsd_strsignal( signo ), self->job_id
					);
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	 }
	END_TRY

	fsd_log_return(( "" ));
}
Пример #10
0
/*
 * Refreshes the job's status from LSF.
 *
 * Each pass takes the session's DRM connection mutex, opens a job-info
 * query for the job and:
 *  - if no record is returned: raises FSD_DRMAA_ERRNO_INVALID_JOB for a
 *    job without FSD_JOB_CURRENT_SESSION, otherwise calls
 *    self->on_missing();
 *  - otherwise reads one record and passes it to lsf_self->read_job_info().
 * The query is closed and the mutex released in FINALLY on every pass.
 *
 * The pass is repeated while no record was found.
 * NOTE(review): termination of that loop therefore depends on on_missing()
 * raising or on the job showing up on a later pass - confirm against the
 * on_missing implementation.
 */
static void
lsfdrmaa_job_update_status( fsd_job_t *self )
{
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	struct jobInfoEnt *volatile job_info = NULL;
	bool job_in_queue;

	/* The time difference is a time_t; cast it so it matches "%d". */
	fsd_log_enter(( "({job_id=%s, time_delta=%d})", self->job_id,
				(int)( time(NULL) - self->submit_time ) ));
	do {
		fsd_mutex_lock( &self->session->drm_connection_mutex );
		TRY
		 {
			int n_records;
			/* Initialised because it is logged even when
			 * lsb_readjobinfo() fails and may not have set it. */
			int more = 0;
			char * username = (lsf_self->int_job_id>0)?"all":NULL;

			fsd_log_debug(( "drm connection locked" ));

			n_records = lsb_openjobinfo( lsf_self->int_job_id,
						NULL, username, NULL, NULL, ALL_JOB );
			fsd_log_debug((
						"lsb_openjobinfo( %d[%d], NULL, %s, NULL, NULL, ALL_JOB ) =%d",
						LSB_ARRAY_JOBID(lsf_self->int_job_id),
						LSB_ARRAY_IDX(lsf_self->int_job_id),
						username?username:"******",
						n_records ));

			job_in_queue = n_records > 0;

			if(!job_in_queue){
				if(!(self->flags & FSD_JOB_CURRENT_SESSION)){
					fsd_exc_raise_code( FSD_DRMAA_ERRNO_INVALID_JOB );
				}else{/*handling missing job*/
					self->on_missing(self);
				}
			}else{
				job_info = lsb_readjobinfo( &more );
				fsd_log_debug(( "lsb_readjobinfo(...) =%p: more=%d",
							(void*)job_info, more ));
				if( job_info == NULL )
					fsd_exc_raise_lsf( "lsb_readjobinfo" );
				lsf_self->read_job_info( self, job_info );
			}
		 }
		FINALLY
		 {
			/* lsfdrmaa_free_job_info( job_info ); */
			lsb_closejobinfo();
			fsd_log_debug(( "lsb_closejobinfo()" ));
			fsd_mutex_unlock( &self->session->drm_connection_mutex );
		 }
		END_TRY

	} while( !job_in_queue );
	fsd_log_return(( "" ));
}
Пример #11
0
/*
 * Releases everything owned by a job object: its condition variables,
 * its mutex, the job_id string, the execution_hosts string and the
 * object itself.
 *
 * execution_hosts starts out NULL in fsd_job_new() and is later filled
 * with an allocated string (see lsfdrmaa_job_read_job_info), so it has to
 * be freed here as well; otherwise it leaks with every destroyed job.
 */
void
fsd_job_destroy( fsd_job_t *self )
{
	fsd_log_enter(( "(%p={job_id=%s})", (void*)self, self->job_id ));
	fsd_cond_destroy( &self->status_cond );
	fsd_cond_destroy( &self->destroy_cond );
	fsd_mutex_destroy( &self->mutex );
	fsd_free( self->job_id );
	fsd_free( self->execution_hosts );
	fsd_free( self );
	fsd_log_return(( "" ));
}
Пример #12
0
/*
 * Drops one reference to the job and unlocks its mutex.
 *
 * When the reference count reaches zero, self->destroy() is called after
 * the mutex has been unlocked.
 */
void
fsd_job_release( fsd_job_t *self )
{
	bool last_reference;

	fsd_log_enter(( "(%p={job_id=%s, ref_cnt=%d}) [unlock %s]",
				(void*)self, self->job_id, self->ref_cnt, self->job_id ));
	fsd_assert( self->ref_cnt > 0 );

	self->ref_cnt -= 1;
	last_reference = ( self->ref_cnt == 0 );

	fsd_mutex_unlock( &self->mutex );
	if( last_reference )
	 {
		self->destroy( self );
	 }
	fsd_log_return(( "" ));
}
Пример #13
0
/*
 * Allocates and initialises a job object for `job_id`.
 *
 * The job stores the `job_id` pointer itself (no copy is made). It is
 * returned with ref_cnt == 1 and with its mutex locked.
 *
 * On any exception the partially built job is destroyed through
 * self->destroy(), or, if the allocation itself failed, `job_id` is
 * freed; the exception is then re-raised.
 */
fsd_job_t *
fsd_job_new( char *job_id )
{
	fsd_job_t *volatile self = NULL;

	fsd_log_enter(( "(%s)", job_id ));
	TRY
	 {
		fsd_malloc( self, fsd_job_t );

		/* method table */
		self->release = fsd_job_release;
		self->destroy = fsd_job_destroy;
		self->control = fsd_job_control;
		self->update_status = fsd_job_update_status;
		self->get_termination_status = fsd_job_get_termination_status;
		self->on_missing = fsd_job_on_missing;

		/* identity and bookkeeping */
		self->job_id            = job_id;
		self->session           = NULL;
		self->next              = NULL;
		self->ref_cnt           = 1;
		self->flags             = 0;
		self->retry_cnt         = 0;

		/* status */
		self->state             = DRMAA_PS_UNDETERMINED;
		self->exit_status       = 0;
		self->last_update_time  = 0;

		/* timing and resource usage */
		self->submit_time       = 0;
		self->start_time        = 0;
		self->end_time          = 0;
		self->walltime          = 0;
		self->cpu_usage         = 0;
		self->mem_usage         = 0;
		self->vmem_usage        = 0;

		/* execution hosts */
		self->n_execution_hosts = 0;
		self->execution_hosts   = NULL;

		/* synchronisation objects; the job is handed out locked */
		fsd_mutex_init( &self->mutex );
		fsd_cond_init( &self->status_cond );
		fsd_cond_init( &self->destroy_cond );
		fsd_mutex_lock( &self->mutex );
	 }
	EXCEPT_DEFAULT
	 {
		if( self == NULL )
			fsd_free( job_id );
		else
			self->destroy( self );
		fsd_exc_reraise();
	 }
	END_TRY
	fsd_log_return(( "=%p: ref_cnt=%d [lock %s]",
				(void*)self, self->ref_cnt, self->job_id ));
	return self;
}
/*
 * Tears down a job object: destroys its two condition variables and its
 * mutex, frees the strings it holds (job_id, execution_hosts, queue,
 * project) and finally the object itself.
 */
void
fsd_job_destroy( fsd_job_t *self )
{
	fsd_log_enter(( "(%p={job_id=%s})", (void*)self, self->job_id ));

	/* synchronisation objects */
	fsd_cond_destroy( &self->status_cond );
	fsd_cond_destroy( &self->destroy_cond );
	fsd_mutex_destroy( &self->mutex );

	/* owned strings */
	fsd_free( self->job_id );
	fsd_free( self->execution_hosts );
	fsd_free( self->queue );
	fsd_free( self->project );

	/* the object itself goes last */
	fsd_free( self );

	fsd_log_return(( "" ));
}
Пример #15
0
/*
 * Inserts `job` at the head of the hash bucket selected by its job_id,
 * and increments both the set's job counter and the job's reference
 * count. The set mutex is held during the insertion.
 */
void
fsd_job_set_add( fsd_job_set_t *self, fsd_job_t *job )
{
	uint32_t bucket;

	fsd_log_enter(( "(job=%p, job_id=%s)", (void*)job, job->job_id ));
	fsd_mutex_lock( &self->mutex );

	bucket = hashstr( job->job_id, strlen(job->job_id), 0 );
	bucket &= self->tab_mask;

	/* push onto the front of the chain */
	job->next = self->tab[ bucket ];
	self->tab[ bucket ] = job;

	self->n_jobs += 1;
	job->ref_cnt += 1;

	fsd_mutex_unlock( &self->mutex );
	fsd_log_return(( ": job->ref_cnt=%d", job->ref_cnt ));
}
Пример #16
0
/*
 * Waits for every job named in `input_job_ids` (a NULL-terminated array).
 *
 * If any entry equals DRMAA_JOB_IDS_SESSION_ALL, the list returned by
 * self->get_submited_job_ids() is waited for instead. Each job is waited
 * for through self->wait_for_single_job(); FSD_DRMAA_ERRNO_INVALID_JOB
 * from an individual wait is swallowed.
 *
 * Raises FSD_ERRNO_INVALID_ARGUMENT when `input_job_ids` is NULL. The id
 * list obtained from the session is freed in FINALLY.
 */
void
fsd_drmaa_session_synchronize(
		fsd_drmaa_session_t *self,
		const char **input_job_ids, const struct timespec *timeout,
		bool dispose
		)
{
	volatile bool wait_for_all = false;
	char **volatile session_ids = NULL;
	const char **job_ids = NULL;
	const char **cursor;

	fsd_log_enter(( "(job_ids={...}, timeout=..., dispose=%d)",
			(int)dispose ));

	if( input_job_ids == NULL )
		fsd_exc_raise_code( FSD_ERRNO_INVALID_ARGUMENT );

	TRY
	 {
		/* Look for the "all jobs of this session" marker. */
		cursor = input_job_ids;
		while( *cursor != NULL )
		 {
			if( strcmp( *cursor, DRMAA_JOB_IDS_SESSION_ALL ) == 0 )
				wait_for_all = true;
			cursor++;
		 }

		if( !wait_for_all )
			job_ids = input_job_ids;
		else
		 {
			session_ids = self->get_submited_job_ids( self );
			job_ids = (const char**)session_ids;
		 }

		for( cursor = job_ids;  *cursor != NULL;  cursor++ )
		 {
			TRY
			 {
				self->wait_for_single_job( self, *cursor, timeout, NULL, NULL, dispose );
			 }
			EXCEPT( FSD_DRMAA_ERRNO_INVALID_JOB )
			 { /* job was ripped by another thread */ }
			END_TRY
		 }
	 }
	FINALLY
	 {
		fsd_free_vector( session_ids );
	 }
	END_TRY
}
Пример #17
0
/*
 * Shuts the session down and destroys it.
 *
 * Steps, in order:
 *  1. under the session mutex, mark destroy_requested and broadcast the
 *     wait condition (or note that a destroy is already in progress);
 *  2. if a destroy was already in progress, release the session and raise
 *     FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION;
 *  3. signal all jobs of the session;
 *  4. under the session mutex, wait on destroy_condition until ref_cnt is
 *     no greater than 1, then stop the wait thread if it was started;
 *  5. call self->destroy_nowait().
 */
void
fsd_drmaa_session_destroy( fsd_drmaa_session_t *self )
{
	bool destroy_in_progress = false;

	fsd_log_enter(( "" ));

	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		if( !self->destroy_requested )
		 {
			self->destroy_requested = true;
			fsd_cond_broadcast( &self->wait_condition );
		 }
		else
			destroy_in_progress = true;
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY

	if( destroy_in_progress )
	 { /* XXX: actually it can not happen in current implementation
				when using DRMAA API */
		self->release( self );
		fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );
	 }

	self->jobs->signal_all( self->jobs );

	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		/* Wait until this is the only remaining reference. */
		while( self->ref_cnt > 1 )
		 {
			fsd_cond_wait( &self->destroy_condition, &self->mutex );
		 }
		fsd_log_debug(("started = %d  run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));
		if( self->wait_thread_started )
		 {
			self->stop_wait_thread( self );
		 }
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY

	self->destroy_nowait( self );
	fsd_log_return(( "" ));
}
Пример #18
0
/*
 * Frees the session immediately, without waiting for other users:
 * configuration dictionary, contact string, job set (when present), the
 * session's mutexes and condition variables, and the session object.
 */
void
fsd_drmaa_session_destroy_nowait( fsd_drmaa_session_t *self )
{
	fsd_log_enter(( "" ));

	/* owned data */
	fsd_conf_dict_destroy( self->configuration );
	fsd_free( self->contact );

	/* the job set may not have been created */
	if( self->jobs != NULL )
	 {
		self->jobs->destroy( self->jobs );
	 }

	/* synchronisation objects */
	fsd_mutex_destroy( &self->mutex );
	fsd_cond_destroy( &self->wait_condition );
	fsd_cond_destroy( &self->destroy_condition );
	fsd_mutex_destroy( &self->drm_connection_mutex );

	fsd_free( self );

	fsd_log_return(( "" ));
}
Пример #19
0
/*
 * Destroys a job set: releases the set's reference on every job still in
 * the table, then frees the table, the set's mutex and the set itself.
 *
 * Each job's mutex is locked before job->release() is called, because
 * release unlocks it. The successor pointer is read before the release
 * since the job may be destroyed by it.
 *
 * The set mutex created by fsd_job_set_new() is destroyed here; without
 * that the mutex would never be destroyed.
 */
void
fsd_job_set_destroy( fsd_job_set_t *self )
{
	size_t i;
	fsd_job_t *j;

	fsd_log_enter(( "()" ));
	for( i = 0;  i < self->tab_size;  i++ )
		for( j = self->tab[i];  j != NULL;  )
		 {
			fsd_job_t *job = j;
			j = j->next;
			fsd_mutex_lock( &job->mutex );
			job->release( job );
		 }
	fsd_free( self->tab );
	fsd_mutex_destroy( &self->mutex );
	fsd_free( self );
	fsd_log_return(( "" ));
}
Пример #20
0
/*
 * Creates an empty job set with a 1024-bucket hash table.
 *
 * tab_mask is set to tab_size - 1 and is used by the other set functions
 * to reduce a hash value to a bucket index.
 *
 * On any exception the table and the set are freed and the exception is
 * re-raised.
 */
fsd_job_set_t *
fsd_job_set_new(void)
{
	fsd_job_set_t *volatile self = NULL;
	const size_t n_buckets = 1024;

	fsd_log_enter(( "()" ));
	TRY
	 {
		fsd_malloc( self, fsd_job_set_t );

		/* method table */
		self->add = fsd_job_set_add;
		self->remove = fsd_job_set_remove;
		self->get = fsd_job_set_get;
		self->empty = fsd_job_set_empty;
		self->find_terminated = fsd_job_set_find_terminated;
		self->get_all_job_ids = fsd_job_set_get_all_job_ids;
		self->signal_all = fsd_job_set_signal_all;
		self->destroy = fsd_job_set_destroy;

		/* tab is cleared first so the handler below can free it safely
		 * if the table allocation raises */
		self->n_jobs = 0;
		self->tab = NULL;
		fsd_calloc( self->tab, n_buckets, fsd_job_t* );
		self->tab_size = n_buckets;
		self->tab_mask = self->tab_size - 1;

		fsd_mutex_init( &self->mutex );
	 }
	EXCEPT_DEFAULT
	 {
		if( self != NULL )
		 {
			fsd_free( self->tab );
			fsd_free( self );
		 }
		fsd_exc_reraise();
	 }
	END_TRY

	fsd_log_return(( " =%p", (void*)self ));
	return self;
}
Пример #21
0
/*
 * Body of the session's wait thread.
 *
 * While wait_thread_run_flag is set, each iteration updates the status of
 * all jobs, broadcasts the session's wait condition and then waits on
 * that same condition for at most pool_delay. Exceptions raised inside an
 * iteration are logged and the loop continues.
 *
 * The session mutex is taken before the loop and released in FINALLY if
 * the lock call reported success. Always returns NULL.
 */
void *
fsd_drmaa_session_wait_thread( fsd_drmaa_session_t *self )
{
	struct timespec wake_at;
	bool volatile locked = false;

	fsd_log_enter(( "" ));
	locked = fsd_mutex_lock( &self->mutex );
	TRY
	 {
		while( self->wait_thread_run_flag )
		 {
			TRY
			 {
				fsd_log_debug(( "wait thread: next iteration" ));
				self->update_all_jobs_status( self );
				fsd_cond_broadcast( &self->wait_condition );

				/* wait for the next polling interval */
				fsd_get_time( &wake_at );
				fsd_ts_add( &wake_at, &self->pool_delay );
				fsd_cond_timedwait( &self->wait_condition, &self->mutex, &wake_at );
			 }
			EXCEPT_DEFAULT
			 {
				const fsd_exc_t *e = fsd_exc_get();
				fsd_log_error(( "wait thread: <%d:%s>", e->code(e), e->message(e) ));
			 }
			END_TRY
		 }
	 }
	FINALLY
	 {
		if( locked )
		 {
			fsd_mutex_unlock( &self->mutex );
		 }
	 }
	END_TRY

	fsd_log_return(( " =NULL" ));
	return NULL;
}
Пример #22
0
/*
 * Allocates and initialises a DRMAA session object.
 *
 * contact - contact string; a copy made with fsd_strdup() is stored.
 *
 * The method table is filled with the fsd_drmaa_session_* defaults, the
 * session starts with ref_cnt == 1, a pool_delay of 5 seconds, the wait
 * thread enabled but not started, and a fresh job set.
 *
 * On any exception a non-NULL session is passed to self->destroy() and
 * the exception is re-raised.
 */
fsd_drmaa_session_t *
fsd_drmaa_session_new( const char *contact )
{
	fsd_drmaa_session_t *volatile self = NULL;

	fsd_log_enter(( "(%s)", contact ));
	TRY
	 {
		fsd_malloc( self, fsd_drmaa_session_t );

		self->release = fsd_drmaa_session_release;
		self->destroy = fsd_drmaa_session_destroy;
		self->destroy_nowait = fsd_drmaa_session_destroy_nowait;
		self->run_job = fsd_drmaa_session_run_job;
		self->run_bulk = fsd_drmaa_session_run_bulk;
		self->control_job = fsd_drmaa_session_control_job;
		self->job_ps = fsd_drmaa_session_job_ps;
		self->synchronize = fsd_drmaa_session_synchronize;
		self->wait = fsd_drmaa_session_wait;
		self->new_job = fsd_drmaa_session_new_job;
		self->run_impl = fsd_drmaa_session_run_impl;
		self->wait_for_single_job = fsd_drmaa_session_wait_for_single_job;
		self->wait_for_any_job = fsd_drmaa_session_wait_for_any_job;
		self->wait_for_job_status_change =
			fsd_drmaa_session_wait_for_job_status_change;
		self->wait_thread = fsd_drmaa_session_wait_thread;
		self->stop_wait_thread = fsd_drmaa_session_stop_wait_thread;
		self->update_all_jobs_status = fsd_drmaa_session_update_all_jobs_status;
		self->get_submited_job_ids = fsd_drmaa_session_get_submited_job_ids;
		self->get_job = fsd_drmaa_session_get_job;
		self->load_configuration = fsd_drmaa_session_load_configuration;
		self->read_configuration = fsd_drmaa_session_read_configuration;
		self->apply_configuration = fsd_drmaa_session_apply_configuration;

		self->ref_cnt = 1;
		self->destroy_requested = false;
		self->contact = NULL;
		self->jobs = NULL;
		self->configuration = NULL;
		self->pool_delay.tv_sec = 5;
		self->pool_delay.tv_nsec = 0;
		self->cache_job_state = 0;
		self->enable_wait_thread = true;
		self->job_categories = NULL;
		self->missing_jobs = FSD_REVEAL_MISSING_JOBS;
		self->wait_thread_started = false;
		self->wait_thread_run_flag = false;

		fsd_mutex_init( &self->mutex );
		fsd_cond_init( &self->wait_condition );
		fsd_cond_init( &self->destroy_condition );
		fsd_mutex_init( &self->drm_connection_mutex );
		self->jobs = fsd_job_set_new();
		self->contact = fsd_strdup( contact );
	 }
	EXCEPT_DEFAULT
	 {
		if( self != NULL )
			self->destroy( self );
		fsd_exc_reraise();
	 }
	END_TRY

	/* sizeof yields a size_t; cast it so the argument matches "%d". */
	fsd_log_debug(("sizeof(fsd_drmaa_session_t)=%d", (int)sizeof(fsd_drmaa_session_t)));
	return self;
}
Пример #23
0
/*
 * Waits until some job in the session's job set has terminated and
 * returns a copy of its id.
 *
 * self    - session whose job set (self->jobs) is searched
 * timeout - deadline passed to the timed waits, or NULL
 * status  - passed through to job->get_termination_status()
 * rusage  - passed through to job->get_termination_status()
 * dispose - when true and no exception is pending, the job is removed
 *           from the set and flagged FSD_JOB_DISPOSED
 *
 * Returns a string allocated with fsd_strdup().
 *
 * Raises FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION when destroy_requested is set,
 * FSD_DRMAA_ERRNO_INVALID_JOB when the job set is empty, and
 * FSD_DRMAA_ERRNO_EXIT_TIMEOUT when the timed wait returns false.
 */
char *
fsd_drmaa_session_wait_for_any_job(
		fsd_drmaa_session_t *self,
		const struct timespec *timeout,
		int *status, fsd_iter_t **rusage,
		bool dispose
		)
{
	fsd_job_set_t *set = self->jobs;
	fsd_job_t *volatile job = NULL;
	char *volatile job_id = NULL;
	/* tracks whether self->mutex is held, so FINALLY can release it */
	volatile bool locked = false;

	fsd_log_enter(( "" ));

	TRY
	 {
		while( job == NULL )
		 {
			bool signaled = true;

			if( self->destroy_requested )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );

			/* without a wait thread, job statuses are refreshed here */
			if( !self->enable_wait_thread )
				self->update_all_jobs_status( self );

			locked = fsd_mutex_lock( &self->mutex );
			if( set->empty( set ) )
				fsd_exc_raise_msg( FSD_DRMAA_ERRNO_INVALID_JOB,
						"No job found to be waited for" );

			/* leaving the loop here keeps self->mutex held; it is
			 * released in FINALLY through `locked` */
			if( (job = set->find_terminated( set )) != NULL )
				break;

			if( self->destroy_requested )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );
			if( self->enable_wait_thread )
			 {
				fsd_log_debug(( "wait_for_any_job: waiting for wait thread" ));
				if( timeout )
					signaled = fsd_cond_timedwait(
							&self->wait_condition, &self->mutex, timeout );
				else
					fsd_cond_wait( &self->wait_condition, &self->mutex );
			 }
			else
			 {
				fsd_log_debug(( "wait_for_any_job: waiting for next check" ));
				self->wait_for_job_status_change( self,
						&self->wait_condition, &self->mutex, timeout );
			 }
			locked = fsd_mutex_unlock( &self->mutex );
			fsd_log_debug((
						"wait_for_any_job: woken up; signaled=%d", signaled ));

			if( !signaled )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );

		 }
		fsd_log_debug(( "wait_for_any_job: waiting finished" ));

		job_id = fsd_strdup( job->job_id );
		job->get_termination_status( job, status, rusage );
	 }
	EXCEPT_DEFAULT
	 {
		/* the id copy is not returned on failure, so free it here */
		if( job_id )
			fsd_free( job_id );
		fsd_exc_reraise();
	 }
	FINALLY
	 {
		if( job )
		 {
			/* dispose only when no exception is pending */
			if( fsd_exc_get() == NULL  &&  dispose )
			 {
				set->remove( set, job );
				job->flags |= FSD_JOB_DISPOSED;
			 }
			job->release( job );
		 }
		if( locked )
			fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY

	fsd_log_return(( " =%s", job_id ));
	return job_id;
}
Пример #24
0
/*
 * Refreshes the job's DRMAA state from SLURM.
 *
 * With the session's DRM connection mutex held, the job record is loaded
 * with slurm_load_job(). If that fails with ESLURM_INVALID_JOB_ID,
 * self->on_missing() is called; any other failure raises
 * FSD_ERRNO_INTERNAL_ERROR. When a record was loaded, its base job state
 * is mapped to a DRMAA_PS_* value and, for finished jobs, exit_status is
 * set. Waiters on status_cond are woken when the state is DRMAA_PS_DONE
 * or greater.
 *
 * The loaded record is freed and the mutex released in FINALLY.
 */
static void
slurmdrmaa_job_update_status( fsd_job_t *self )
{
	job_info_msg_t *job_info = NULL;
	slurmdrmaa_job_t * slurm_self = (slurmdrmaa_job_t *) self;
	fsd_log_enter(( "({job_id=%s})", self->job_id ));

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	{
		if ( slurm_load_job( &job_info, fsd_atoi(self->job_id), SHOW_ALL) ) {
			/* Capture the error code once; the message below must
			 * describe the same error that was tested. */
			int load_errno = slurm_get_errno();

			if (load_errno == ESLURM_INVALID_JOB_ID) {
				self->on_missing(self);
			} else {
				fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"slurm_load_jobs error: %s,job_id: %s", slurm_strerror(load_errno), self->job_id);
			}
		}
		if (job_info) {
			fsd_log_debug(("state = %d, state_reason = %d", job_info->job_array[0].job_state, job_info->job_array[0].state_reason));
			
			switch(job_info->job_array[0].job_state & JOB_STATE_BASE)
			{

				case JOB_PENDING:
					switch(job_info->job_array[0].state_reason)
					{
						#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0)
						case WAIT_HELD_USER:   /* job is held by user */
							fsd_log_debug(("interpreting as DRMAA_PS_USER_ON_HOLD"));
							self->state = DRMAA_PS_USER_ON_HOLD;
							break;
						#endif
						case WAIT_HELD:  /* job is held by administrator */
							fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_ON_HOLD"));
							self->state = DRMAA_PS_SYSTEM_ON_HOLD;
							break;
						default:
							fsd_log_debug(("interpreting as DRMAA_PS_QUEUED_ACTIVE"));
							self->state = DRMAA_PS_QUEUED_ACTIVE;
					}
					break;
				case JOB_RUNNING:
					fsd_log_debug(("interpreting as DRMAA_PS_RUNNING"));
					self->state = DRMAA_PS_RUNNING;
					break;
				case JOB_SUSPENDED:
					if(slurm_self->user_suspended == true) {
						fsd_log_debug(("interpreting as DRMAA_PS_USER_SUSPENDED"));
						self->state = DRMAA_PS_USER_SUSPENDED;
					} else {
						fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_SUSPENDED"));
						self->state = DRMAA_PS_SYSTEM_SUSPENDED;
					}
					break;
				case JOB_COMPLETE:
					fsd_log_debug(("interpreting as DRMAA_PS_DONE"));
					self->state = DRMAA_PS_DONE;
					self->exit_status = job_info->job_array[0].exit_code;
					fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
					break;
				case JOB_CANCELLED:
					fsd_log_debug(("interpreting as DRMAA_PS_FAILED (aborted)"));
					self->state = DRMAA_PS_FAILED;
					self->exit_status = -1;
					/* Stop here: falling through would overwrite the
					 * -1 marker with the job's exit code. */
					break;
				case JOB_FAILED:
				case JOB_TIMEOUT:
				case JOB_NODE_FAIL:
				#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,3,0)
				case JOB_PREEMPTED:
				#endif
					fsd_log_debug(("interpreting as DRMAA_PS_FAILED"));
					self->state = DRMAA_PS_FAILED;
					self->exit_status = job_info->job_array[0].exit_code;
					fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
					break;
				default: /*unknown state */
					fsd_log_error(("Unknown job state: %d. Please send bug report: http://apps.man.poznan.pl/trac/slurm-drmaa", job_info->job_array[0].job_state));
			}

			if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_COMPLETING) {
				fsd_log_debug(("Epilog completing"));
			}

			if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_CONFIGURING) {
				fsd_log_debug(("Nodes booting"));
			}

			if (self->exit_status == -1) /* input,output,error path failure etc*/
				self->state = DRMAA_PS_FAILED;

			self->last_update_time = time(NULL);
		
			if( self->state >= DRMAA_PS_DONE ) {
				fsd_log_debug(("exit_status = %d, WEXITSTATUS(exit_status) = %d", self->exit_status, WEXITSTATUS(self->exit_status)));
				fsd_cond_broadcast( &self->status_cond );
			}
		}
	}
	FINALLY
	{
		if(job_info != NULL)
			slurm_free_job_info_msg (job_info);

		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	}
	END_TRY
	
	fsd_log_return(( "" ));
}
Пример #25
0
/*
 * Copies the contents of an LSF job record into the job object.
 *
 * The record is first dumped to the debug log. Then the JOB_STAT_* bits
 * of job_info->status are translated into FSD_JOB_* flags and into a
 * DRMAA_PS_* state, and exit status, timing, resource usage and execution
 * host fields are updated. execution_hosts is filled only while it is
 * still NULL. Waiters on status_cond are woken when the resulting state
 * is DRMAA_PS_DONE or greater.
 */
static void
lsfdrmaa_job_read_job_info( fsd_job_t *self, struct jobInfoEnt *job_info )
{
	int status, flags;

	fsd_log_enter(( "" ));
	 {
		int i;
		fsd_log_debug(( "job status of %s updated from %d[%d]",
					self->job_id,
					LSB_ARRAY_JOBID(job_info->jobId),
					LSB_ARRAY_IDX(job_info->jobId) ));
		fsd_log_debug(( "\n  status: 0x%x", job_info->status ));
		fsd_log_debug(( "\n  submitTime: %ld", job_info->submitTime ));
		fsd_log_debug(( "\n  startTime: %ld", job_info->startTime ));
		/* the end time line must print endTime, not startTime */
		fsd_log_debug(( "\n  endTime: %ld", job_info->endTime ));
		fsd_log_debug(( "\n  duration: %d", job_info->duration ));
		fsd_log_debug(( "\n  cpuTime: %f", job_info->cpuTime ));
		fsd_log_debug(( "\n  cwd: %s", job_info->cwd ));
		fsd_log_debug(( "\n  fromHost: %s", job_info->fromHost ));
		fsd_log_debug(( "\n  numExHosts: %d", job_info->numExHosts ));
		for( i = 0;  i < job_info->numExHosts;  i++ )
			fsd_log_debug(( "\n  exHosts[%d]: %s", i, job_info->exHosts[i] ));
		fsd_log_debug(( "\n  exitStatus: %d", job_info->exitStatus ));
		fsd_log_debug(( "\n  execCwd: %s", job_info->execCwd ));
		fsd_log_debug(( "\n  runRusage.mem: %d", job_info->runRusage.mem ));
		fsd_log_debug(( "\n  runRusage.swap: %d", job_info->runRusage.swap ));
		fsd_log_debug(( "\n  runRusage.utime: %d", job_info->runRusage.utime ));
		fsd_log_debug(( "\n  runRusage.stime: %d", job_info->runRusage.stime ));
		fsd_log_debug(( "\n  jName: %s", job_info->jName ));
		/* fsd_log_debug(( "\n  execRusage: %s", job_info->execRusage )); */
	 }

	status = job_info->status;

	/* Translate LSF status bits into the generic job flags. */
	flags = 0;
	if( status & (JOB_STAT_PEND | JOB_STAT_PSUSP) )
		flags |= FSD_JOB_QUEUED;
	if( status & JOB_STAT_PSUSP )
		flags |= FSD_JOB_HOLD;
	if( status & (JOB_STAT_RUN | JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_RUNNING;
	if( status & (JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_SUSPENDED;
	if( status & (JOB_STAT_DONE | JOB_STAT_EXIT) )
		flags |= FSD_JOB_TERMINATED;
	if( status & (JOB_STAT_EXIT | JOB_STAT_PERR) )
		flags |= FSD_JOB_ABORTED;
	self->flags &= ~(FSD_JOB_STATE_MASK | FSD_JOB_ABORTED);
	self->flags |= flags;

	/* First matching status bit decides the DRMAA state; a status with
	 * none of the tested bits is treated as failed. */
	if( status & (JOB_STAT_WAIT | JOB_STAT_PEND) )
		self->state = DRMAA_PS_QUEUED_ACTIVE;
	else if( status & JOB_STAT_PSUSP )
		self->state = DRMAA_PS_USER_ON_HOLD;
	else if( status & JOB_STAT_RUN )
		self->state = DRMAA_PS_RUNNING;
	else if( status & JOB_STAT_SSUSP )
		self->state = DRMAA_PS_SYSTEM_SUSPENDED;
	else if( status & JOB_STAT_USUSP )
		self->state = DRMAA_PS_USER_SUSPENDED;
	else if( status & JOB_STAT_DONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_EXIT )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_PDONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_PERR )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_UNKWN )
		self->state = DRMAA_PS_UNDETERMINED;
	else
		self->state = DRMAA_PS_FAILED;

	/* Keep everything above the low byte of the LSF exit status; if that
	 * leaves a zero exit code for a job marked JOB_STAT_EXIT, set the
	 * lowest bit so the status is non-zero. */
	self->exit_status = job_info->exitStatus & ~0xff;
	if( (self->exit_status >> 8) == 0  &&  (job_info->status & JOB_STAT_EXIT) )
		self->exit_status |= 0x01;
	self->start_time = job_info->startTime;
	self->end_time = job_info->endTime;
	self->cpu_usage = job_info->cpuTime;
	/* memory figures only ever grow: the largest value seen is kept */
	self->mem_usage = max( self->mem_usage, 1024*job_info->runRusage.mem );
	self->vmem_usage = max( self->vmem_usage, 1024*job_info->runRusage.swap );
	self->walltime = 60*job_info->duration;
	self->n_execution_hosts = job_info->numExHosts;
	if( self->execution_hosts == NULL  &&  job_info->exHosts != NULL )
		self->execution_hosts
			= fsd_explode( (const char*const*)job_info->exHosts, ' ',
					job_info->numExHosts );
	self->last_update_time = time(NULL);
	if( self->state >= DRMAA_PS_DONE )
		fsd_cond_broadcast( &self->status_cond );
	fsd_log_return(( "" ));
}
Пример #26
0
/*
 * Applies a DRMAA control action to a SLURM job while holding the
 * session's DRM connection mutex.
 *
 *  SUSPEND   - slurm_suspend(), then marks the job as user-suspended
 *  HOLD      - slurm_update_job() with priority 0
 *  RESUME    - slurm_resume(), then clears the user-suspended mark
 *  RELEASE   - slurm_update_job() with priority INFINITE
 *  TERMINATE - slurm_kill_job() with SIGKILL
 *
 * Raises FSD_ERRNO_INTERNAL_ERROR when the SLURM call returns -1 and
 * FSD_ERRNO_INVALID_ARGUMENT for an unknown action.
 *
 * The job id is parsed with fsd_atoi() in every branch, so HOLD and
 * RELEASE handle the id the same way as the other actions instead of
 * going through unchecked atoi().
 */
static void
slurmdrmaa_job_control( fsd_job_t *self, int action )
{
	slurmdrmaa_job_t *slurm_self = (slurmdrmaa_job_t*)self;
	job_desc_msg_t job_desc;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	 {
		switch( action )
		 {
			case DRMAA_CONTROL_SUSPEND:
				if(slurm_suspend(fsd_atoi(self->job_id)) == -1) {
					fsd_exc_raise_fmt(	FSD_ERRNO_INTERNAL_ERROR,"slurm_suspend error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				slurm_self->user_suspended = true;
				break;
			case DRMAA_CONTROL_HOLD:
				/* change priority to 0*/
				slurm_init_job_desc_msg(&job_desc);
				/* NOTE(review): this records the priority of the freshly
				 * initialised descriptor, not one read from the job -
				 * confirm that is what old_priority is meant to hold. */
				slurm_self->old_priority = job_desc.priority;
				job_desc.job_id = fsd_atoi(self->job_id);
				job_desc.priority = 0;
				job_desc.alloc_sid = 0;
				if(slurm_update_job(&job_desc) == -1) {
					fsd_exc_raise_fmt(	FSD_ERRNO_INTERNAL_ERROR,"slurm_update_job error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				break;
			case DRMAA_CONTROL_RESUME:
				if(slurm_resume(fsd_atoi(self->job_id)) == -1) {
					fsd_exc_raise_fmt(	FSD_ERRNO_INTERNAL_ERROR,"slurm_resume error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				slurm_self->user_suspended = false;
				break;
			case DRMAA_CONTROL_RELEASE:
				/* change priority back*/
				slurm_init_job_desc_msg(&job_desc);
				job_desc.priority = INFINITE;
				job_desc.job_id = fsd_atoi(self->job_id);
				if(slurm_update_job(&job_desc) == -1) {
					fsd_exc_raise_fmt(	FSD_ERRNO_INTERNAL_ERROR,"slurm_update_job error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				break;
			case DRMAA_CONTROL_TERMINATE:
				if(slurm_kill_job(fsd_atoi(self->job_id),SIGKILL,0) == -1) {
					fsd_exc_raise_fmt(	FSD_ERRNO_INTERNAL_ERROR,"slurm_terminate_job error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				break;
			default:
				fsd_exc_raise_fmt(
						FSD_ERRNO_INVALID_ARGUMENT,
						"job::control: unknown action %d", action );
		 }
					
		fsd_log_debug(("job::control: successful"));
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	 }
	END_TRY

	fsd_log_return(( "" ));
}
Пример #27
0
/*
 * Waits until the job identified by `job_id` has terminated and reports
 * its termination status.
 *
 * self    - session the job belongs to
 * job_id  - id of the job to wait for
 * timeout - deadline for the wait, or NULL
 * status  - passed through to job->get_termination_status()
 * rusage  - passed through to job->get_termination_status()
 * dispose - when true, the job is removed from the session's job set and
 *           flagged FSD_JOB_DISPOSED after its status was read
 *
 * Raises FSD_DRMAA_ERRNO_INVALID_JOB when the job cannot be found and
 * FSD_DRMAA_ERRNO_EXIT_TIMEOUT when the timed wait returns false or when
 * destroy_requested is set after the wait.
 */
void
fsd_drmaa_session_wait_for_single_job(
		fsd_drmaa_session_t *self,
		const char *job_id, const struct timespec *timeout,
		int *status, fsd_iter_t **rusage,
		bool dispose
		)
{
	fsd_job_t *volatile job = NULL;
	/* tracks whether self->mutex is held, so FINALLY can release it */
	volatile bool locked = false;

	fsd_log_enter(( "(%s)", job_id ));
	TRY
	 {
		job = self->get_job( self, job_id );
		if( job == NULL )
			fsd_exc_raise_fmt( FSD_DRMAA_ERRNO_INVALID_JOB,
					"Job '%s' not found in DRMS queue", job_id );
		job->update_status( job );
		while( !self->destroy_requested  &&  job->state < DRMAA_PS_DONE )
		 {
			bool signaled = true;
			fsd_log_debug(( "fsd_drmaa_session_wait_for_single_job: "
						"waiting for %s to terminate", job_id ));
			if( self->enable_wait_thread )
			 {
				if( timeout )
					signaled = fsd_cond_timedwait(
							&job->status_cond, &job->mutex, timeout );
				else
				 {
					fsd_cond_wait( &job->status_cond, &job->mutex );
				 }
				if( !signaled )
					fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );
			 }
			else
			 {
				self->wait_for_job_status_change(
						self, &job->status_cond, &job->mutex, timeout );
			 }

			fsd_log_debug(( "fsd_drmaa_session_wait_for_single_job: woken up" ));
			/* without a wait thread the status is refreshed here */
			if( !self->enable_wait_thread )
				job->update_status( job );
		 }

		if( self->destroy_requested )
			fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );

		job->get_termination_status( job, status, rusage );
		if( dispose )
		 {
			job->release( job ); /*release mutex in order to ensure proper order of locking: first job_set mutex then job mutex */
			/* The reference is gone; forget the pointer so FINALLY cannot
			 * release it a second time if anything raises before the job
			 * is looked up again. */
			job = NULL;

			locked = fsd_mutex_lock( &self->mutex );

			job = self->get_job( self, job_id );
			if (job != NULL)
			 {
				self->jobs->remove( self->jobs, job );
				job->flags |= FSD_JOB_DISPOSED;
			 }
			else
			 {
				fsd_log_error(("Some other thread has already reaped job %s", job_id ));
			 }

			locked = fsd_mutex_unlock( &self->mutex );
		 }
	 }
	FINALLY
	 {
		if ( job )
			job->release( job );
		if ( locked )
			fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY
	fsd_log_return((""));
}