/*
 * Look up a job by its identifier in the job set's hash table.
 * On success the job is returned with its own mutex HELD and its
 * reference count incremented; the caller must release() it.
 * Returns NULL when no job with the given id is present.
 */
fsd_job_t *
fsd_job_set_get( fsd_job_set_t *self, const char *job_id )
{
	fsd_job_t *found = NULL;
	uint32_t bucket;

	fsd_log_enter(( "(job_id=%s)", job_id ));
	fsd_mutex_lock( &self->mutex );
	bucket = hashstr( job_id, strlen(job_id), 0 ) & self->tab_mask;
	for( found = self->tab[ bucket ];  found != NULL;  found = found->next )
	 {
		if( strcmp( found->job_id, job_id ) == 0 )
			break;
	 }
	if( found != NULL )
	 {
		/* hand the job to the caller locked and referenced */
		fsd_mutex_lock( &found->mutex );
		fsd_assert( !(found->flags & FSD_JOB_DISPOSED) );
		found->ref_cnt++;
	 }
	fsd_mutex_unlock( &self->mutex );
	if( found != NULL )
		fsd_log_return(( "(job_id=%s) =%p: ref_cnt=%d [lock %s]",
				job_id, (void*)found, found->ref_cnt, found->job_id ));
	else
		fsd_log_return(( "(job_id=%s) =NULL", job_id ));
	return found;
}
/*
 * Scan the whole hash table for the first job whose state indicates
 * termination (state >= DRMAA_PS_DONE).  On success the job is returned
 * with its mutex HELD and its reference count incremented; returns NULL
 * when no terminated job exists.
 */
fsd_job_t *
fsd_job_set_find_terminated( fsd_job_set_t *self )
{
	fsd_job_t *job = NULL;
	size_t i;
	/* volatile: pointer must stay valid across the longjmp performed
	 * by the TRY/FINALLY exception machinery */
	fsd_mutex_t* volatile mutex = & self->mutex;
	fsd_log_enter(( "()" ));
	fsd_mutex_lock( mutex );
	TRY
	 {
		/* linear scan over all buckets; goto breaks out of both loops */
		for( i = 0;  i < self->tab_size;  i++ )
			for( job = self->tab[ i ];  job;  job = job->next )
				if( job->state >= DRMAA_PS_DONE )
					goto found;
found:
		/* job == NULL here iff the loops ran to completion */
		if( job )
		 {
			fsd_mutex_lock( &job->mutex );
			fsd_assert( !(job->flags & FSD_JOB_DISPOSED) );
			job->ref_cnt ++;
		 }
	 }
	FINALLY
	 {
		fsd_mutex_unlock( mutex );
	 }
	END_TRY
	if( job )
		fsd_log_return(( "() =%p: job_id=%s, ref_cnt=%d [lock %s]",
				(void*)job, job->job_id, job->ref_cnt, job->job_id ));
	else
		fsd_log_return(( "() =%p", (void*)job ));
	return job;
}
/*
 * Unlink a job from the job set's hash table and drop the set's
 * reference to it.  Raises FSD_DRMAA_ERRNO_INVALID_JOB when the job
 * is not a member of the set.
 */
void
fsd_job_set_remove( fsd_job_set_t *self, fsd_job_t *job )
{
	fsd_job_t **link = NULL;
	uint32_t bucket;

	fsd_log_enter(( "(job_id=%s)", job->job_id ));
	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		bucket = hashstr( job->job_id, strlen(job->job_id), 0 ) & self->tab_mask;
		/* walk the bucket chain by address of each next-pointer */
		link = &self->tab[ bucket ];
		while( *link != NULL  &&  *link != job )
			link = &(*link)->next;
		if( *link == NULL )
			fsd_exc_raise_code( FSD_DRMAA_ERRNO_INVALID_JOB );
		*link = job->next;
		job->next = NULL;
		self->n_jobs--;
		job->ref_cnt--;  /* the set no longer references the job */
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY
	fsd_log_return(( ": job->ref_cnt=%d", job->ref_cnt ));
}
/*
 * Broadcast every job's status condition so that all threads blocked in
 * wait calls wake up (used e.g. during session shutdown).
 */
void
fsd_job_set_signal_all( fsd_job_set_t *self )
{
	/* volatile: values must survive the setjmp/longjmp in TRY/FINALLY */
	fsd_job_t *volatile job = NULL;
	fsd_mutex_t *volatile mutex = & self->mutex;

	fsd_log_enter(( "" ));
	fsd_mutex_lock( mutex );
	TRY
	 {
		volatile size_t bucket;
		for( bucket = 0;  bucket < self->tab_size;  bucket++ )
		 {
			for( job = self->tab[ bucket ];  job != NULL;  job = job->next )
			 {
				fsd_mutex_lock( &job->mutex );
				TRY
				 {
					fsd_cond_broadcast( &job->status_cond );
				 }
				FINALLY
				 {
					fsd_mutex_unlock( &job->mutex );
				 }
				END_TRY
			 }
		 }
	 }
	FINALLY
	 {
		fsd_mutex_unlock( mutex );
	 }
	END_TRY
	fsd_log_return(( "" ));
}
/*
 * Invoked when a session job can no longer be found in the SLURM queue:
 * synthesise a terminal state from the last state we observed and wake
 * up anyone waiting on it.
 */
static void
slurmdrmaa_job_on_missing( fsd_job_t *self )
{
	fsd_log_enter(( "({job_id=%s})", self->job_id ));
	fsd_log_warning(( "Job %s missing from DRM queue", self->job_id ));
	fsd_log_info(( "job_on_missing: last job_ps: %s (0x%02x)", drmaa_job_ps_to_str(self->state), self->state));

	if( self->state < DRMAA_PS_RUNNING )
	 {
		/* never observed running: report it as failed */
		self->state = DRMAA_PS_FAILED;
		self->exit_status = -1;
	 }
	else
	 {
		/*if the job ever entered running state assume finished */
		self->state = DRMAA_PS_DONE;
		self->exit_status = 0;
	 }

	fsd_log_info(("job_on_missing evaluation result: state=%d exit_status=%d", self->state, self->exit_status));

	/* wake waiters on both the job and the session */
	fsd_cond_broadcast( &self->status_cond);
	fsd_cond_broadcast( &self->session->wait_condition );
	fsd_log_return(( "; job_ps=%s, exit_status=%d", drmaa_job_ps_to_str(self->state), self->exit_status ));
}
/*
 * Sleep on `wait_condition` until a status change is signalled or until
 * the earlier of (a) the session's poll interval (pool_delay) and
 * (b) the caller-supplied absolute `timeout` expires.
 * Raises FSD_DRMAA_ERRNO_EXIT_TIMEOUT only when the caller's own
 * deadline (not merely the poll interval) passed without a signal.
 *
 * Fix: corrected the misspelled "untill" in the debug log message.
 */
void
fsd_drmaa_session_wait_for_job_status_change(
		fsd_drmaa_session_t *self,
		fsd_cond_t *wait_condition,
		fsd_mutex_t *mutex,
		const struct timespec *timeout )
{
	struct timespec ts, *next_check = &ts;
	bool status_changed;

	if( timeout )
		fsd_log_enter(( "(timeout=%ld.%09ld)",
				timeout->tv_sec, timeout->tv_nsec ));
	else
		fsd_log_enter(( "(timeout=(null))" ));

	/* next wake-up is now + pool_delay, clipped to the caller's deadline */
	fsd_get_time( next_check );
	fsd_ts_add( next_check, &self->pool_delay );
	if( timeout  &&  fsd_ts_cmp( timeout, next_check ) < 0 )
		next_check = (struct timespec*)timeout;  /* const cast: never written through */

	fsd_log_debug(( "wait_for_job_status_change: waiting until %ld.%09ld",
			next_check->tv_sec, next_check->tv_nsec ));
	status_changed = fsd_cond_timedwait( wait_condition, mutex, next_check );
	/* timing out on the poll interval is normal; only the caller's
	 * deadline produces an error */
	if( !status_changed  &&  next_check == timeout )
		fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );

	fsd_log_return(( ": next_check=%ld.%09ld, status_changed=%d",
			next_check->tv_sec, next_check->tv_nsec,
			(int)status_changed ));
}
/*
 * Implement drmaa_control() for LSF: map the DRMAA action onto a POSIX
 * signal and deliver it with lsb_signaljob() while holding the DRM
 * connection mutex.  Raises FSD_ERRNO_INVALID_ARGUMENT for unknown
 * actions and FSD_ERRNO_INTERNAL_ERROR when signalling fails.
 *
 * Fix: removed the local `job_id` variable, which was assigned but
 * never used (lsb_signaljob reads lsf_self->int_job_id directly).
 */
static void
lsfdrmaa_job_control( fsd_job_t *self, int action )
{
	/*
	 * XXX: waiting for job state change was removed
	 * since it is not required for drmaa_control
	 * to return after change completes.
	 */
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	int signal;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));
	switch( action )
	 {
		case DRMAA_CONTROL_SUSPEND:
		case DRMAA_CONTROL_HOLD:
			signal = SIGSTOP;
			break;
		case DRMAA_CONTROL_RESUME:
		case DRMAA_CONTROL_RELEASE:
			signal = SIGCONT;
			break;
		case DRMAA_CONTROL_TERMINATE:
			/* TODO: sending SIGTERM (configurable)? */
			signal = SIGKILL;
			break;
		default:
			fsd_exc_raise_fmt( FSD_ERRNO_INVALID_ARGUMENT,
					"job::control: unknown action %d", action );
	 }

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	 {
		int rc = lsb_signaljob( lsf_self->int_job_id, signal );
		fsd_log_debug(( "lsb_signaljob( %d[%d], %d ) = %d",
				LSB_ARRAY_JOBID(lsf_self->int_job_id),
				LSB_ARRAY_IDX(lsf_self->int_job_id),
				signal, rc ));
		if( rc < 0 )
			fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,
					"job::control: could not send %s to job %s",
					fsd_strsignal( signal ), self->job_id );
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	 }
	END_TRY
	fsd_log_return(( "" ));
}
/*
 * Refresh this job's status from the LSF batch system.
 * Queries mbatchd through lsb_openjobinfo()/lsb_readjobinfo() while
 * holding the DRM connection mutex.  When the job is no longer listed:
 * jobs from another session raise FSD_DRMAA_ERRNO_INVALID_JOB, jobs of
 * the current session are handled by on_missing().
 */
static void
lsfdrmaa_job_update_status( fsd_job_t *self )
{
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	/* volatile: must survive the TRY/FINALLY longjmp */
	struct jobInfoEnt *volatile job_info = NULL;
	bool job_in_queue;
	/* NOTE(review): time(NULL)-submit_time is printed with %d although the
	 * operands are time_t — works on typical platforms, but confirm. */
	fsd_log_enter(( "({job_id=%s, time_delta=%d})", self->job_id, time(NULL) - self->submit_time ));
	do {
		fsd_mutex_lock( &self->session->drm_connection_mutex );
		TRY
		 {
			int n_records;
			int more;
			/* "all" makes LSF return jobs of all users when we know
			 * the numeric job id */
			char * username = (lsf_self->int_job_id>0)?"all":NULL;
			fsd_log_debug(( "drm connection locked" ));
			n_records = lsb_openjobinfo( lsf_self->int_job_id, NULL, username, NULL, NULL, ALL_JOB );
			fsd_log_debug(( "lsb_openjobinfo( %d[%d], NULL, %s, NULL, NULL, ALL_JOB ) =%d", LSB_ARRAY_JOBID(lsf_self->int_job_id), LSB_ARRAY_IDX(lsf_self->int_job_id), username?username:"******", n_records ));
			job_in_queue = n_records > 0;
			if(!job_in_queue){
				if(!(self->flags & FSD_JOB_CURRENT_SESSION)){
					fsd_exc_raise_code( FSD_DRMAA_ERRNO_INVALID_JOB );
				}else{/*handling missing job*/
					/* NOTE(review): if on_missing() returns normally the
					 * do/while re-queries LSF with job_in_queue still false —
					 * presumably on_missing raises or the retry is intended;
					 * confirm there is no busy loop here. */
					self->on_missing(self);
				}
			}else{
				job_info = lsb_readjobinfo( &more );
				fsd_log_debug(( "lsb_readjobinfo(...) =%p: more=%d", (void*)job_info, more ));
				if( job_info == NULL )
					fsd_exc_raise_lsf( "lsb_readjobinfo" );
				/* translate the LSF record into DRMAA job fields */
				lsf_self->read_job_info( self, job_info );
			}
		 }
		FINALLY
		 {
			/* lsfdrmaa_free_job_info( job_info ); */
			lsb_closejobinfo();
			fsd_log_debug(( "lsb_closejobinfo()" ));
			fsd_mutex_unlock( &self->session->drm_connection_mutex );
		 }
		END_TRY
	} while( !job_in_queue );
	fsd_log_return(( "" ));
}
/*
 * Free a job structure once its reference count reached zero:
 * tear down its synchronisation primitives, then release the
 * job id string and the structure itself.
 */
void
fsd_job_destroy( fsd_job_t *self )
{
	fsd_log_enter(( "(%p={job_id=%s})", (void*)self, self->job_id ));
	fsd_cond_destroy( &self->status_cond );
	fsd_cond_destroy( &self->destroy_cond );
	fsd_mutex_destroy( &self->mutex );
	fsd_free( self->job_id );
	fsd_free( self );
	fsd_log_return(( "" ));
}
/*
 * Drop one reference to the job and unlock its mutex (which the caller
 * must hold).  When the last reference is dropped the job is destroyed.
 */
void
fsd_job_release( fsd_job_t *self )
{
	bool last_ref;

	fsd_log_enter(( "(%p={job_id=%s, ref_cnt=%d}) [unlock %s]", (void*)self, self->job_id, self->ref_cnt, self->job_id ));
	fsd_assert( self->ref_cnt > 0 );
	self->ref_cnt--;
	last_ref = ( self->ref_cnt == 0 );
	/* unlock before destroying: destroy tears the mutex down */
	fsd_mutex_unlock( &self->mutex );
	if( last_ref )
		self->destroy( self );
	fsd_log_return(( "" ));
}
/*
 * Free a job structure and everything it owns: synchronisation
 * primitives, the job id, the execution host list and the queue/project
 * strings (fsd_free tolerates NULL).
 */
void
fsd_job_destroy( fsd_job_t *self )
{
	fsd_log_enter(( "(%p={job_id=%s})", (void*)self, self->job_id ));
	fsd_cond_destroy( &self->status_cond );
	fsd_cond_destroy( &self->destroy_cond );
	fsd_mutex_destroy( &self->mutex );
	/* owned strings and arrays */
	fsd_free( self->job_id );
	fsd_free( self->execution_hosts );
	fsd_free( self->queue );
	fsd_free( self->project );
	fsd_free( self );
	fsd_log_return(( "" ));
}
/*
 * Allocate and initialise a generic job object.
 * Ownership of the `job_id` string passes to the job (it is freed by
 * destroy, or here on error).  The job is returned with ref_cnt == 1
 * and with its mutex LOCKED; the caller must eventually release() it.
 */
fsd_job_t *
fsd_job_new( char *job_id )
{
	/* volatile: value must survive the TRY/EXCEPT longjmp */
	fsd_job_t *volatile self = NULL;
	fsd_log_enter(( "(%s)", job_id ));
	TRY
	 {
		fsd_malloc( self, fsd_job_t );  /* raises on allocation failure */
		/* virtual method table (may be overridden by DRM backends) */
		self->release = fsd_job_release;
		self->destroy = fsd_job_destroy;
		self->control = fsd_job_control;
		self->update_status = fsd_job_update_status;
		self->get_termination_status = fsd_job_get_termination_status;
		self->on_missing = fsd_job_on_missing;
		self->next = NULL;
		self->ref_cnt = 1;
		self->job_id = job_id;  /* takes ownership */
		self->session = NULL;
		self->last_update_time = 0;
		self->flags = 0;
		self->state = DRMAA_PS_UNDETERMINED;
		self->exit_status = 0;
		self->submit_time = 0;
		self->start_time = 0;
		self->end_time = 0;
		self->cpu_usage = 0;
		self->mem_usage = 0;
		self->vmem_usage = 0;
		self->walltime = 0;
		self->n_execution_hosts = 0;
		self->execution_hosts = NULL;
		self->retry_cnt = 0;
		fsd_mutex_init( &self->mutex );
		fsd_cond_init( &self->status_cond );
		fsd_cond_init( &self->destroy_cond );
		/* hand the job to the caller locked */
		fsd_mutex_lock( &self->mutex );
	 }
	EXCEPT_DEFAULT
	 {
		if( self )
			self->destroy( self );  /* destroy also frees job_id */
		else
			fsd_free( job_id );
		fsd_exc_reraise();
	 }
	END_TRY
	fsd_log_return(( "=%p: ref_cnt=%d [lock %s]", (void*)self, self->ref_cnt, self->job_id ));
	return self;
}
/*
 * Insert a job into the set's hash table and take a reference to it
 * on behalf of the set.
 */
void
fsd_job_set_add( fsd_job_set_t *self, fsd_job_t *job )
{
	uint32_t bucket;

	fsd_log_enter(( "(job=%p, job_id=%s)", (void*)job, job->job_id ));
	fsd_mutex_lock( &self->mutex );
	bucket = hashstr( job->job_id, strlen(job->job_id), 0 ) & self->tab_mask;
	/* push onto the front of the bucket's chain */
	job->next = self->tab[ bucket ];
	self->tab[ bucket ] = job;
	self->n_jobs++;
	job->ref_cnt++;
	fsd_mutex_unlock( &self->mutex );
	fsd_log_return(( ": job->ref_cnt=%d", job->ref_cnt ));
}
/*
 * Destroy a DRMAA session in two phases: first mark destruction as
 * requested and wake all waiters; then wait until this thread holds the
 * only remaining reference, stop the wait thread, and free everything
 * via destroy_nowait().
 */
void
fsd_drmaa_session_destroy( fsd_drmaa_session_t *self )
{
	bool already_destroying = false;
	fsd_log_enter(( "" ));
	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		if( self->destroy_requested )
			already_destroying = true;
		else
		 {
			self->destroy_requested = true;
			/* wake threads blocked on the session-wide condition */
			fsd_cond_broadcast( &self->wait_condition );
		 }
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY
	if( already_destroying )
	 { /* XXX: actually it can not happen in current implementation when using DRMAA API */
		self->release( self );
		fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );
	 }
	/* wake threads blocked on per-job status conditions */
	self->jobs->signal_all( self->jobs );
	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		/* wait until we hold the sole remaining session reference */
		while( self->ref_cnt > 1 )
			fsd_cond_wait( &self->destroy_condition, &self->mutex );
		fsd_log_debug(("started = %d run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));
		if( self->wait_thread_started )
			self->stop_wait_thread( self );
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY
	self->destroy_nowait( self );
	fsd_log_return(( "" ));
}
/*
 * Free all resources owned by the session without any waiting:
 * configuration, contact string, the job set, the synchronisation
 * primitives and finally the session structure itself.
 */
void
fsd_drmaa_session_destroy_nowait( fsd_drmaa_session_t *self )
{
	fsd_log_enter(( "" ));
	fsd_conf_dict_destroy( self->configuration );
	fsd_free( self->contact );
	if( self->jobs )
		self->jobs->destroy( self->jobs );
	/* tear down synchronisation primitives before freeing the struct */
	fsd_cond_destroy( &self->wait_condition );
	fsd_cond_destroy( &self->destroy_condition );
	fsd_mutex_destroy( &self->mutex );
	fsd_mutex_destroy( &self->drm_connection_mutex );
	fsd_free( self );
	fsd_log_return(( "" ));
}
/*
 * Destroy the job set: drop the set's reference to every contained job
 * (destroying those that reach zero) and free the hash table.
 */
void
fsd_job_set_destroy( fsd_job_set_t *self )
{
	unsigned bucket;
	fsd_job_t *iter;

	fsd_log_enter(( "()" ));
	for( bucket = 0;  bucket < self->tab_size;  bucket++ )
	 {
		iter = self->tab[ bucket ];
		while( iter != NULL )
		 {
			fsd_job_t *current = iter;
			iter = iter->next;  /* advance before release may free it */
			fsd_mutex_lock( &current->mutex );
			current->release( current );
		 }
	 }
	fsd_free( self->tab );
	fsd_free( self );
	fsd_log_return(( "" ));
}
/*
 * Create an empty job set: a chained hash table of jobs keyed by job id.
 * NOTE: initial_size must be a power of two — bucket selection uses
 * tab_mask = tab_size - 1 as a bit mask.
 */
fsd_job_set_t *
fsd_job_set_new(void)
{
	/* volatile: value must survive the TRY/EXCEPT longjmp */
	fsd_job_set_t *volatile self = NULL;
	const size_t initial_size = 1024;
	fsd_log_enter(( "()" ));
	TRY
	 {
		fsd_malloc( self, fsd_job_set_t );  /* raises on failure */
		/* virtual method table */
		self->destroy = fsd_job_set_destroy;
		self->add = fsd_job_set_add;
		self->remove = fsd_job_set_remove;
		self->get = fsd_job_set_get;
		self->empty = fsd_job_set_empty;
		self->find_terminated = fsd_job_set_find_terminated;
		self->get_all_job_ids = fsd_job_set_get_all_job_ids;
		self->signal_all = fsd_job_set_signal_all;
		self->tab = NULL;
		self->n_jobs = 0;
		fsd_calloc( self->tab, initial_size, fsd_job_t* );  /* zeroed buckets */
		self->tab_size = initial_size;
		self->tab_mask = self->tab_size - 1;
		fsd_mutex_init( &self->mutex );
	 }
	EXCEPT_DEFAULT
	 {
		if( self )
		 {
			fsd_free( self->tab );
			fsd_free( self );
		 }
		fsd_exc_reraise();
	 }
	END_TRY
	fsd_log_return(( " =%p", (void*)self ));
	return self;
}
/*
 * Body of the session's background wait thread: in a loop, refresh the
 * status of all jobs, broadcast the session-wide wait condition, and
 * sleep for pool_delay (releasing the session mutex while sleeping).
 * Exceptions from a single iteration are logged and swallowed so the
 * thread keeps running until wait_thread_run_flag is cleared.
 */
void *
fsd_drmaa_session_wait_thread( fsd_drmaa_session_t *self )
{
	struct timespec ts, *next_check = &ts;
	/* volatile: must survive the TRY/FINALLY longjmp */
	bool volatile locked = false;
	fsd_log_enter(( "" ));
	locked = fsd_mutex_lock( &self->mutex );
	TRY
	 {
		while( self->wait_thread_run_flag )
			TRY
			 {
				fsd_log_debug(( "wait thread: next iteration" ));
				self->update_all_jobs_status( self );
				fsd_cond_broadcast( &self->wait_condition );
				/* sleep until the next poll time */
				fsd_get_time( next_check );
				fsd_ts_add( next_check, &self->pool_delay );
				fsd_cond_timedwait( &self->wait_condition, &self->mutex, next_check );
			 }
			EXCEPT_DEFAULT
			 {
				/* log and continue: one failed poll must not kill the thread */
				const fsd_exc_t *e = fsd_exc_get();
				fsd_log_error(( "wait thread: <%d:%s>", e->code(e), e->message(e) ));
			 }
			END_TRY
	 }
	FINALLY
	 {
		if (locked)
			fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY
	fsd_log_return(( " =NULL" ));
	return NULL;
}
/*
 * Translate an LSF jobInfoEnt record into the generic DRMAA job fields:
 * flags, DRMAA program state, exit status, timestamps and resource
 * usage.  Broadcasts the job's status condition when the job reached a
 * terminal state.
 *
 * Fix: the "endTime" debug line previously printed job_info->startTime
 * (copy-paste error); it now prints job_info->endTime.
 */
static void
lsfdrmaa_job_read_job_info( fsd_job_t *self, struct jobInfoEnt *job_info )
{
	int status, flags;
	fsd_log_enter(( "" ));

	{
		int i;
		fsd_log_debug(( "job status of %s updated from %d[%d]", self->job_id, LSB_ARRAY_JOBID(job_info->jobId), LSB_ARRAY_IDX(job_info->jobId) ));
		fsd_log_debug(( "\n status: 0x%x", job_info->status ));
		fsd_log_debug(( "\n submitTime: %ld", job_info->submitTime ));
		fsd_log_debug(( "\n startTime: %ld", job_info->startTime ));
		fsd_log_debug(( "\n endTime: %ld", job_info->endTime ));  /* was startTime */
		fsd_log_debug(( "\n duration: %d", job_info->duration ));
		fsd_log_debug(( "\n cpuTime: %f", job_info->cpuTime ));
		fsd_log_debug(( "\n cwd: %s", job_info->cwd ));
		fsd_log_debug(( "\n fromHost: %s", job_info->fromHost ));
		fsd_log_debug(( "\n numExHosts: %d", job_info->numExHosts ));
		for( i = 0;  i < job_info->numExHosts;  i++ )
			fsd_log_debug(( "\n exHosts[%d]: %s", i, job_info->exHosts[i] ));
		fsd_log_debug(( "\n exitStatus: %d", job_info->exitStatus ));
		fsd_log_debug(( "\n execCwd: %s", job_info->execCwd ));
		fsd_log_debug(( "\n runRusage.mem: %d", job_info->runRusage.mem ));
		fsd_log_debug(( "\n runRusage.swap: %d", job_info->runRusage.swap ));
		fsd_log_debug(( "\n runRusage.utime: %d", job_info->runRusage.utime ));
		fsd_log_debug(( "\n runRusage.stime: %d", job_info->runRusage.stime ));
		fsd_log_debug(( "\n jName: %s", job_info->jName ));
		/* fsd_log_debug(( "\n execRusage: %s", job_info->execRusage )); */
	}

	/* map LSF status bits onto generic FSD job flags */
	status = job_info->status;
	flags = 0;
	if( status & (JOB_STAT_PEND | JOB_STAT_PSUSP) )
		flags |= FSD_JOB_QUEUED;
	if( status & JOB_STAT_PSUSP )
		flags |= FSD_JOB_HOLD;
	if( status & (JOB_STAT_RUN | JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_RUNNING;
	if( status & (JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_SUSPENDED;
	if( status & (JOB_STAT_DONE | JOB_STAT_EXIT) )
		flags |= FSD_JOB_TERMINATED;
	if( status & (JOB_STAT_EXIT | JOB_STAT_PERR) )
		flags |= FSD_JOB_ABORTED;
	self->flags &= ~(FSD_JOB_STATE_MASK | FSD_JOB_ABORTED);
	self->flags |= flags;

	/* derive the DRMAA program state (first matching bit wins) */
	if( status & (JOB_STAT_WAIT | JOB_STAT_PEND) )
		self->state = DRMAA_PS_QUEUED_ACTIVE;
	else if( status & JOB_STAT_PSUSP )
		self->state = DRMAA_PS_USER_ON_HOLD;
	else if( status & JOB_STAT_RUN )
		self->state = DRMAA_PS_RUNNING;
	else if( status & JOB_STAT_SSUSP )
		self->state = DRMAA_PS_SYSTEM_SUSPENDED;
	else if( status & JOB_STAT_USUSP )
		self->state = DRMAA_PS_USER_SUSPENDED;
	else if( status & JOB_STAT_DONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_EXIT )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_PDONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_PERR )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_UNKWN )
		self->state = DRMAA_PS_UNDETERMINED;
	else
		self->state = DRMAA_PS_FAILED;

	/* keep only the exit-code bits (wait(2)-style layout); when LSF
	 * reports JOB_STAT_EXIT with a zero exit code mark a failure bit */
	self->exit_status = job_info->exitStatus & ~0xff;
	if( (self->exit_status >> 8) == 0  &&  (job_info->status & JOB_STAT_EXIT) )
		self->exit_status |= 0x01;

	self->start_time = job_info->startTime;
	self->end_time = job_info->endTime;
	self->cpu_usage = job_info->cpuTime;
	/* runRusage values are in kB; keep the high-water mark in bytes */
	self->mem_usage = max( self->mem_usage, 1024*job_info->runRusage.mem );
	self->vmem_usage = max( self->vmem_usage, 1024*job_info->runRusage.swap );
	self->walltime = 60*job_info->duration;  /* minutes -> seconds */
	self->n_execution_hosts = job_info->numExHosts;
	if( self->execution_hosts == NULL  &&  job_info->exHosts != NULL )
		self->execution_hosts = fsd_explode( (const char*const*)job_info->exHosts, ' ', job_info->numExHosts );
	self->last_update_time = time(NULL);

	if( self->state >= DRMAA_PS_DONE )
		fsd_cond_broadcast( &self->status_cond );
	fsd_log_return(( "" ));
}
/*
 * Wait until the job identified by `job_id` terminates, the `timeout`
 * expires, or the session is being destroyed.  On success the
 * termination status and resource usage are returned through
 * status/rusage; when `dispose` is true the job is also removed from
 * the session (reaped, as drmaa_wait() requires).
 */
void
fsd_drmaa_session_wait_for_single_job(
		fsd_drmaa_session_t *self,
		const char *job_id, const struct timespec *timeout,
		int *status, fsd_iter_t **rusage,
		bool dispose )
{
	/* volatile: values must survive the TRY/FINALLY longjmp */
	fsd_job_t *volatile job = NULL;
	volatile bool locked = false;
	fsd_log_enter(( "(%s)", job_id ));
	TRY
	 {
		job = self->get_job( self, job_id );  /* returned with job mutex held */
		if( job == NULL )
			fsd_exc_raise_fmt( FSD_DRMAA_ERRNO_INVALID_JOB, "Job '%s' not found in DRMS queue", job_id );
		job->update_status( job );
		while( !self->destroy_requested  &&  job->state < DRMAA_PS_DONE )
		 {
			bool signaled = true;
			fsd_log_debug(( "fsd_drmaa_session_wait_for_single_job: " "waiting for %s to terminate", job_id ));
			if( self->enable_wait_thread )
			 {
				/* wait thread polls the DRM; just sleep on the condition */
				if( timeout )
					signaled = fsd_cond_timedwait( &job->status_cond, &job->mutex, timeout );
				else
				 {
					fsd_cond_wait( &job->status_cond, &job->mutex );
				 }
				if( !signaled )
					fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );
			 }
			else
			 {
				/* no wait thread: poll ourselves with bounded sleeps */
				self->wait_for_job_status_change( self, &job->status_cond, &job->mutex, timeout );
			 }
			fsd_log_debug(( "fsd_drmaa_session_wait_for_single_job: woken up" ));
			if( !self->enable_wait_thread )
				job->update_status( job );
		 }
		if( self->destroy_requested )
			fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );
		job->get_termination_status( job, status, rusage );
		if( dispose )
		 {
			job->release( job );
			/*release mutex in order to ensure proper order of locking: first job_set mutex then job mutex */
			locked = fsd_mutex_lock( &self->mutex );
			job = self->get_job( self, job_id );
			if (job != NULL)
			 {
				self->jobs->remove( self->jobs, job );
				job->flags |= FSD_JOB_DISPOSED;
			 }
			else
			 {
				fsd_log_error(("Some other thread has already reaped job %s", job_id ));
			 }
			/* NOTE(review): `locked` is assigned the return of
			 * fsd_mutex_unlock — presumably it returns false so the
			 * FINALLY below does not unlock twice; confirm in
			 * drmaa_utils' thread wrappers. */
			locked = fsd_mutex_unlock( &self->mutex );
		 }
	 }
	FINALLY
	 {
		if ( job )
			job->release( job );
		if ( locked )
			fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY
	fsd_log_return((""));
}
/*
 * Wait until ANY job of the session terminates (or timeout / session
 * destruction interrupts the wait) and return a copy of its job id;
 * the termination status is returned through status/rusage.  When
 * `dispose` is true and no exception is pending, the job is removed
 * from the session.  Caller owns (frees) the returned string.
 */
char *
fsd_drmaa_session_wait_for_any_job(
		fsd_drmaa_session_t *self,
		const struct timespec *timeout,
		int *status, fsd_iter_t **rusage,
		bool dispose )
{
	fsd_job_set_t *set = self->jobs;
	/* volatile: values must survive the TRY/EXCEPT/FINALLY longjmp */
	fsd_job_t *volatile job = NULL;
	char *volatile job_id = NULL;
	volatile bool locked = false;
	fsd_log_enter(( "" ));
	TRY
	 {
		while( job == NULL )
		 {
			bool signaled = true;
			if( self->destroy_requested )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );
			if( !self->enable_wait_thread )
				self->update_all_jobs_status( self );
			locked = fsd_mutex_lock( &self->mutex );
			if( set->empty( set ) )
				fsd_exc_raise_msg( FSD_DRMAA_ERRNO_INVALID_JOB, "No job found to be waited for" );
			/* break leaves the session mutex held; FINALLY releases it */
			if( (job = set->find_terminated( set )) != NULL )
				break;
			if( self->destroy_requested )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );
			if( self->enable_wait_thread )
			 {
				fsd_log_debug(( "wait_for_any_job: waiting for wait thread" ));
				if( timeout )
					signaled = fsd_cond_timedwait( &self->wait_condition, &self->mutex, timeout );
				else
					fsd_cond_wait( &self->wait_condition, &self->mutex );
			 }
			else
			 {
				fsd_log_debug(( "wait_for_any_job: waiting for next check" ));
				self->wait_for_job_status_change( self, &self->wait_condition, &self->mutex, timeout );
			 }
			/* NOTE(review): presumably fsd_mutex_unlock returns false,
			 * clearing `locked` — confirm in drmaa_utils' thread wrappers */
			locked = fsd_mutex_unlock( &self->mutex );
			fsd_log_debug(( "wait_for_any_job: woken up; signaled=%d", signaled ));
			if( !signaled )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );
		 }
		fsd_log_debug(( "wait_for_any_job: waiting finished" ));
		job_id = fsd_strdup( job->job_id );
		job->get_termination_status( job, status, rusage );
	 }
	EXCEPT_DEFAULT
	 {
		if( job_id )
			fsd_free( job_id );
		fsd_exc_reraise();
	 }
	FINALLY
	 {
		if( job )
		 {
			/* reap only on the success path */
			if( fsd_exc_get() == NULL  &&  dispose )
			 {
				set->remove( set, job );
				job->flags |= FSD_JOB_DISPOSED;
			 }
			job->release( job );
		 }
		if( locked )
			fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY
	fsd_log_return(( " =%s", job_id ));
	return job_id;
}
static void slurmdrmaa_job_update_status( fsd_job_t *self ) { job_info_msg_t *job_info = NULL; slurmdrmaa_job_t * slurm_self = (slurmdrmaa_job_t *) self; fsd_log_enter(( "({job_id=%s})", self->job_id )); fsd_mutex_lock( &self->session->drm_connection_mutex ); TRY { if ( slurm_load_job( &job_info, fsd_atoi(self->job_id), SHOW_ALL) ) { int _slurm_errno = slurm_get_errno(); if (_slurm_errno == ESLURM_INVALID_JOB_ID) { self->on_missing(self); } else { fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"slurm_load_jobs error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id); } } if (job_info) { fsd_log_debug(("state = %d, state_reason = %d", job_info->job_array[0].job_state, job_info->job_array[0].state_reason)); switch(job_info->job_array[0].job_state & JOB_STATE_BASE) { case JOB_PENDING: switch(job_info->job_array[0].state_reason) { #if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0) case WAIT_HELD_USER: /* job is held by user */ fsd_log_debug(("interpreting as DRMAA_PS_USER_ON_HOLD")); self->state = DRMAA_PS_USER_ON_HOLD; break; #endif case WAIT_HELD: /* job is held by administrator */ fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_ON_HOLD")); self->state = DRMAA_PS_SYSTEM_ON_HOLD; break; default: fsd_log_debug(("interpreting as DRMAA_PS_QUEUED_ACTIVE")); self->state = DRMAA_PS_QUEUED_ACTIVE; } break; case JOB_RUNNING: fsd_log_debug(("interpreting as DRMAA_PS_RUNNING")); self->state = DRMAA_PS_RUNNING; break; case JOB_SUSPENDED: if(slurm_self->user_suspended == true) { fsd_log_debug(("interpreting as DRMAA_PS_USER_SUSPENDED")); self->state = DRMAA_PS_USER_SUSPENDED; } else { fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_SUSPENDED")); self->state = DRMAA_PS_SYSTEM_SUSPENDED; } break; case JOB_COMPLETE: fsd_log_debug(("interpreting as DRMAA_PS_DONE")); self->state = DRMAA_PS_DONE; self->exit_status = job_info->job_array[0].exit_code; fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status))); break; case JOB_CANCELLED: 
fsd_log_debug(("interpreting as DRMAA_PS_FAILED (aborted)")); self->state = DRMAA_PS_FAILED; self->exit_status = -1; case JOB_FAILED: case JOB_TIMEOUT: case JOB_NODE_FAIL: #if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,3,0) case JOB_PREEMPTED: #endif fsd_log_debug(("interpreting as DRMAA_PS_FAILED")); self->state = DRMAA_PS_FAILED; self->exit_status = job_info->job_array[0].exit_code; fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status))); break; default: /*unknown state */ fsd_log_error(("Unknown job state: %d. Please send bug report: http://apps.man.poznan.pl/trac/slurm-drmaa", job_info->job_array[0].job_state)); } if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_COMPLETING) { fsd_log_debug(("Epilog completing")); } if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_CONFIGURING) { fsd_log_debug(("Nodes booting")); } if (self->exit_status == -1) /* input,output,error path failure etc*/ self->state = DRMAA_PS_FAILED; self->last_update_time = time(NULL); if( self->state >= DRMAA_PS_DONE ) { fsd_log_debug(("exit_status = %d, WEXITSTATUS(exit_status) = %d", self->exit_status, WEXITSTATUS(self->exit_status))); fsd_cond_broadcast( &self->status_cond ); } } } FINALLY { if(job_info != NULL) slurm_free_job_info_msg (job_info); fsd_mutex_unlock( &self->session->drm_connection_mutex ); } END_TRY fsd_log_return(( "" )); }
/*
 * Implement drmaa_control() for SLURM jobs: suspend/resume map to
 * slurm_suspend()/slurm_resume(), hold/release are emulated by setting
 * the job priority to 0 / INFINITE, and terminate sends SIGKILL via
 * slurm_kill_job().  All calls are made under the DRM connection mutex;
 * failures raise FSD_ERRNO_INTERNAL_ERROR.
 */
static void
slurmdrmaa_job_control( fsd_job_t *self, int action )
{
	slurmdrmaa_job_t *slurm_self = (slurmdrmaa_job_t*)self;
	job_desc_msg_t job_desc;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	 {
		switch( action )
		 {
			case DRMAA_CONTROL_SUSPEND:
				if(slurm_suspend(fsd_atoi(self->job_id)) == -1) {
					fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_suspend error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				slurm_self->user_suspended = true;  /* remember who suspended */
				break;
			case DRMAA_CONTROL_HOLD:
				/* change priority to 0*/
				slurm_init_job_desc_msg(&job_desc);
				/* NOTE(review): old_priority is captured from the freshly
				 * initialised job_desc, not from the job's actual priority;
				 * RELEASE below restores INFINITE instead — confirm intent. */
				slurm_self->old_priority = job_desc.priority;
				job_desc.job_id = atoi(self->job_id);
				job_desc.priority = 0;
				job_desc.alloc_sid = 0;
				if(slurm_update_job(&job_desc) == -1) {
					fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_update_job error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				break;
			case DRMAA_CONTROL_RESUME:
				if(slurm_resume(fsd_atoi(self->job_id)) == -1) {
					fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_resume error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				slurm_self->user_suspended = false;
				break;
			case DRMAA_CONTROL_RELEASE:
				/* change priority back*/
				slurm_init_job_desc_msg(&job_desc);
				job_desc.priority = INFINITE;
				job_desc.job_id = atoi(self->job_id);
				if(slurm_update_job(&job_desc) == -1) {
					fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_update_job error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				break;
			case DRMAA_CONTROL_TERMINATE:
				if(slurm_kill_job(fsd_atoi(self->job_id),SIGKILL,0) == -1) {
					fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_terminate_job error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				break;
			default:
				fsd_exc_raise_fmt( FSD_ERRNO_INVALID_ARGUMENT, "job::control: unknown action %d", action );
		 }

		fsd_log_debug(("job::control: successful"));
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	 }
	END_TRY

	fsd_log_return(( "" ));
}