static void slurmdrmaa_job_on_missing( fsd_job_t *self )
{
    fsd_log_enter(( "({job_id=%s})", self->job_id ));
    fsd_log_warning(( "Job %s missing from DRM queue", self->job_id ));

    fsd_log_info(( "job_on_missing: last job_ps: %s (0x%02x)",
            drmaa_job_ps_to_str(self->state), self->state ));

    if( self->state >= DRMAA_PS_RUNNING )
     {  /* if the job ever entered the running state, assume it finished */
        self->state = DRMAA_PS_DONE;
        self->exit_status = 0;
     }
    else
     {  /* otherwise assume it failed */
        self->state = DRMAA_PS_FAILED;
        self->exit_status = -1;
     }

    fsd_log_info(( "job_on_missing evaluation result: state=%d exit_status=%d",
            self->state, self->exit_status ));

    fsd_cond_broadcast( &self->status_cond );
    fsd_cond_broadcast( &self->session->wait_condition );

    fsd_log_return(( "; job_ps=%s, exit_status=%d",
            drmaa_job_ps_to_str(self->state), self->exit_status ));
}
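/*
 * Illustrative sketch (not part of the original sources): drivers such as
 * slurmdrmaa_job_on_missing above and lsfdrmaa_job_on_missing below install
 * a per-job on_missing() hook which the generic status-update path calls
 * when the DRM no longer reports the job (see slurmdrmaa_job_update_status
 * further below, which invokes self->on_missing(self) on
 * ESLURM_INVALID_JOB_ID).  The struct and names here are a simplified,
 * hypothetical reduction of the fsd_job_t interface.
 */
#include <stddef.h>

struct demo_job {
    const char *job_id;
    void (*on_missing)( struct demo_job *self );  /* driver-specific handler */
};

static void demo_update_status( struct demo_job *job, int found_in_queue )
{
    /* if the DRM no longer knows the job, let the driver decide its fate */
    if( !found_in_queue  &&  job->on_missing != NULL )
        job->on_missing( job );
}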
static void lsfdrmaa_job_on_missing( fsd_job_t *self )
{
    const bool lately_submitted =
        (self->flags & FSD_JOB_CURRENT_SESSION)
        &&  self->state == DRMAA_PS_UNDETERMINED
        &&  (time(NULL) - self->submit_time < LSFDRMAA_MAX_QUEUE_SKEW);

    if( lately_submitted )
     {  /* the job was submitted just now; wait a moment before checking the DRM queue again */
        return;
     }

    if( self->session->query_retries > 0
            &&  self->retry_cnt < self->session->query_retries )
     {
        self->retry_cnt++;
        fsd_log_warning(( "job %s missing from DRM queue", self->job_id ));
        return;
     }

    /* start a monitor task to collect information about the missing job */
    if( self->session->monitor_task[0] != '\0' )
        self->session->start_monitor_task( self->session, self->job_id );

    fsd_log_warning(( "terminate the missing job %s", self->job_id ));
    self->state = DRMAA_PS_FAILED;
    self->flags &= ~FSD_JOB_STATE_MASK;
    self->flags |= FSD_JOB_ABORTED;
    self->exit_status = (self->session->missing_jobs_exit_code & 0xff) << 8;
    self->end_time = time(NULL);
    self->last_update_time = time(NULL);
    fsd_cond_broadcast( &self->status_cond );
}
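/*
 * Illustrative sketch (not part of the original sources): the
 * (missing_jobs_exit_code & 0xff) << 8 packing above places the exit code
 * in the high byte with a zero low byte, i.e. the classic wait(2) status
 * layout, so the standard <sys/wait.h> macros can decode it.  The function
 * name and the example value are hypothetical.
 */
#include <stdio.h>
#include <sys/wait.h>

static void demo_decode_packed_exit_status( void )
{
    int missing_jobs_exit_code = 143;                 /* example configured value */
    int packed = (missing_jobs_exit_code & 0xff) << 8;

    if( WIFEXITED(packed) )                           /* low byte zero => normal exit */
        printf( "exit code: %d\n", WEXITSTATUS(packed) );   /* prints 143 */
    else if( WIFSIGNALED(packed) )                    /* non-zero low byte => killed by a signal */
        printf( "signal: %d\n", WTERMSIG(packed) );
}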
void fsd_job_set_signal_all( fsd_job_set_t *self )
{
    fsd_job_t *volatile job = NULL;
    fsd_mutex_t *volatile mutex = &self->mutex;

    fsd_log_enter(( "" ));
    fsd_mutex_lock( mutex );
    TRY
     {
        volatile size_t i;
        for( i = 0;  i < self->tab_size;  i++ )
            for( job = self->tab[ i ];  job;  job = job->next )
             {
                fsd_mutex_lock( &job->mutex );
                TRY
                 {
                    fsd_cond_broadcast( &job->status_cond );
                 }
                FINALLY
                 {
                    fsd_mutex_unlock( &job->mutex );
                 }
                END_TRY
             }
     }
    FINALLY
     {
        fsd_mutex_unlock( mutex );
     }
    END_TRY
    fsd_log_return(( "" ));
}
void fsd_drmaa_session_stop_wait_thread( fsd_drmaa_session_t *self )
{
    volatile int lock_count = 0;

    fsd_log_enter(( "" ));
    fsd_mutex_lock( &self->mutex );
    TRY
     {
        fsd_log_debug(( "started = %d run_flag = %d",
                self->wait_thread_started, self->wait_thread_run_flag ));
        if( self->wait_thread_started )
         {
            self->wait_thread_run_flag = false;
            fsd_log_debug(( "started = %d run_flag = %d",
                    self->wait_thread_started, self->wait_thread_run_flag ));
            fsd_cond_broadcast( &self->wait_condition );
            TRY
             {
                /* release the (possibly recursively locked) session mutex
                   so the wait thread can finish, then join it */
                lock_count = fsd_mutex_unlock_times( &self->mutex );
                fsd_thread_join( self->wait_thread_handle, NULL );
             }
            FINALLY
             {
                int i;
                for( i = 0;  i < lock_count;  i++ )
                    fsd_mutex_lock( &self->mutex );
             }
            END_TRY
            self->wait_thread_started = false;
         }
     }
    FINALLY
     {
        fsd_mutex_unlock( &self->mutex );
     }
    END_TRY
    fsd_log_return(( "" ));
}
void fsd_drmaa_session_release( fsd_drmaa_session_t *self )
{
    fsd_mutex_lock( &self->mutex );
    self->ref_cnt--;
    fsd_assert( self->ref_cnt > 0 );
    /* ref_cnt == 1 means only the destroying thread still holds a reference */
    if( self->ref_cnt == 1 )
        fsd_cond_broadcast( &self->destroy_condition );
    fsd_mutex_unlock( &self->mutex );
}
void fsd_drmaa_session_destroy( fsd_drmaa_session_t *self )
{
    bool already_destroying = false;

    fsd_log_enter(( "" ));
    fsd_mutex_lock( &self->mutex );
    TRY
     {
        if( self->destroy_requested )
            already_destroying = true;
        else
         {
            self->destroy_requested = true;
            fsd_cond_broadcast( &self->wait_condition );
         }
     }
    FINALLY
     {
        fsd_mutex_unlock( &self->mutex );
     }
    END_TRY

    if( already_destroying )
     {  /* XXX: it cannot actually happen in the current implementation
           when using the DRMAA API */
        self->release( self );
        fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );
     }

    self->jobs->signal_all( self->jobs );

    fsd_mutex_lock( &self->mutex );
    TRY
     {
        while( self->ref_cnt > 1 )
            fsd_cond_wait( &self->destroy_condition, &self->mutex );
        fsd_log_debug(( "started = %d run_flag = %d",
                self->wait_thread_started, self->wait_thread_run_flag ));
        if( self->wait_thread_started )
            self->stop_wait_thread( self );
     }
    FINALLY
     {
        fsd_mutex_unlock( &self->mutex );
     }
    END_TRY

    self->destroy_nowait( self );
    fsd_log_return(( "" ));
}
void * fsd_drmaa_session_wait_thread( fsd_drmaa_session_t *self )
{
    struct timespec ts, *next_check = &ts;
    bool volatile locked = false;

    fsd_log_enter(( "" ));
    locked = fsd_mutex_lock( &self->mutex );
    TRY
     {
        while( self->wait_thread_run_flag )
            TRY
             {
                fsd_log_debug(( "wait thread: next iteration" ));
                self->update_all_jobs_status( self );
                fsd_cond_broadcast( &self->wait_condition );

                fsd_get_time( next_check );
                fsd_ts_add( next_check, &self->pool_delay );
                fsd_cond_timedwait( &self->wait_condition,
                        &self->mutex, next_check );
             }
            EXCEPT_DEFAULT
             {
                const fsd_exc_t *e = fsd_exc_get();
                fsd_log_error(( "wait thread: <%d:%s>",
                        e->code(e), e->message(e) ));
             }
            END_TRY
     }
    FINALLY
     {
        if( locked )
            fsd_mutex_unlock( &self->mutex );
     }
    END_TRY

    fsd_log_return(( " =NULL" ));
    return NULL;
}
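/*
 * Illustrative sketch (not part of the original sources): the plain POSIX
 * equivalent of the fsd_get_time / fsd_ts_add / fsd_cond_timedwait sequence
 * above -- sleep until "now + poll delay" unless the condition is signalled
 * earlier.  All names below (poll_ctx, poll_loop, poll_delay_sec) are
 * hypothetical and exist only for this example.
 */
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

struct poll_ctx {
    pthread_mutex_t mutex;
    pthread_cond_t  cond;
    bool            run_flag;
    time_t          poll_delay_sec;
};

static void poll_loop( struct poll_ctx *ctx )
{
    pthread_mutex_lock( &ctx->mutex );
    while( ctx->run_flag )
     {
        struct timespec next_check;
        clock_gettime( CLOCK_REALTIME, &next_check );   /* "now" ... */
        next_check.tv_sec += ctx->poll_delay_sec;       /* ... plus the poll delay */
        /* releases the mutex while waiting; returns on signal or timeout */
        pthread_cond_timedwait( &ctx->cond, &ctx->mutex, &next_check );
     }
    pthread_mutex_unlock( &ctx->mutex );
}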
static void lsfdrmaa_job_read_job_info( fsd_job_t *self, struct jobInfoEnt *job_info )
{
    int status, flags;

    fsd_log_enter(( "" ));

    {
        int i;
        fsd_log_debug(( "job status of %s updated from %d[%d]",
                self->job_id,
                LSB_ARRAY_JOBID(job_info->jobId),
                LSB_ARRAY_IDX(job_info->jobId) ));
        fsd_log_debug(( "\n status: 0x%x", job_info->status ));
        fsd_log_debug(( "\n submitTime: %ld", job_info->submitTime ));
        fsd_log_debug(( "\n startTime: %ld", job_info->startTime ));
        fsd_log_debug(( "\n endTime: %ld", job_info->endTime ));
        fsd_log_debug(( "\n duration: %d", job_info->duration ));
        fsd_log_debug(( "\n cpuTime: %f", job_info->cpuTime ));
        fsd_log_debug(( "\n cwd: %s", job_info->cwd ));
        fsd_log_debug(( "\n fromHost: %s", job_info->fromHost ));
        fsd_log_debug(( "\n numExHosts: %d", job_info->numExHosts ));
        for( i = 0;  i < job_info->numExHosts;  i++ )
            fsd_log_debug(( "\n exHosts[%d]: %s", i, job_info->exHosts[i] ));
        fsd_log_debug(( "\n exitStatus: %d", job_info->exitStatus ));
        fsd_log_debug(( "\n execCwd: %s", job_info->execCwd ));
        fsd_log_debug(( "\n runRusage.mem: %d", job_info->runRusage.mem ));
        fsd_log_debug(( "\n runRusage.swap: %d", job_info->runRusage.swap ));
        fsd_log_debug(( "\n runRusage.utime: %d", job_info->runRusage.utime ));
        fsd_log_debug(( "\n runRusage.stime: %d", job_info->runRusage.stime ));
        fsd_log_debug(( "\n jName: %s", job_info->jName ));
        /* fsd_log_debug(( "\n execRusage: %s", job_info->execRusage )); */
    }

    status = job_info->status;
    flags = 0;
    if( status & (JOB_STAT_PEND | JOB_STAT_PSUSP) )
        flags |= FSD_JOB_QUEUED;
    if( status & JOB_STAT_PSUSP )
        flags |= FSD_JOB_HOLD;
    if( status & (JOB_STAT_RUN | JOB_STAT_USUSP | JOB_STAT_SSUSP) )
        flags |= FSD_JOB_RUNNING;
    if( status & (JOB_STAT_USUSP | JOB_STAT_SSUSP) )
        flags |= FSD_JOB_SUSPENDED;
    if( status & (JOB_STAT_DONE | JOB_STAT_EXIT) )
        flags |= FSD_JOB_TERMINATED;
    if( status & (JOB_STAT_EXIT | JOB_STAT_PERR) )
        flags |= FSD_JOB_ABORTED;
    self->flags &= ~(FSD_JOB_STATE_MASK | FSD_JOB_ABORTED);
    self->flags |= flags;

    if( status & (JOB_STAT_WAIT | JOB_STAT_PEND) )
        self->state = DRMAA_PS_QUEUED_ACTIVE;
    else if( status & JOB_STAT_PSUSP )
        self->state = DRMAA_PS_USER_ON_HOLD;
    else if( status & JOB_STAT_RUN )
        self->state = DRMAA_PS_RUNNING;
    else if( status & JOB_STAT_SSUSP )
        self->state = DRMAA_PS_SYSTEM_SUSPENDED;
    else if( status & JOB_STAT_USUSP )
        self->state = DRMAA_PS_USER_SUSPENDED;
    else if( status & JOB_STAT_DONE )
        self->state = DRMAA_PS_DONE;
    else if( status & JOB_STAT_EXIT )
        self->state = DRMAA_PS_FAILED;
    else if( status & JOB_STAT_PDONE )
        self->state = DRMAA_PS_DONE;
    else if( status & JOB_STAT_PERR )
        self->state = DRMAA_PS_FAILED;
    else if( status & JOB_STAT_UNKWN )
        self->state = DRMAA_PS_UNDETERMINED;
    else
        self->state = DRMAA_PS_FAILED;

    /* keep only the exit code (high byte) of the wait-style status; if it is
       zero but LSF reports abnormal termination (JOB_STAT_EXIT), flag the
       failure in the low byte */
    self->exit_status = job_info->exitStatus & ~0xff;
    if( (self->exit_status >> 8) == 0  &&  (job_info->status & JOB_STAT_EXIT) )
        self->exit_status |= 0x01;

    self->start_time = job_info->startTime;
    self->end_time = job_info->endTime;
    self->cpu_usage = job_info->cpuTime;
    self->mem_usage = max( self->mem_usage, 1024*job_info->runRusage.mem );
    self->vmem_usage = max( self->vmem_usage, 1024*job_info->runRusage.swap );
    self->walltime = 60*job_info->duration;
    self->n_execution_hosts = job_info->numExHosts;
    if( self->execution_hosts == NULL  &&  job_info->exHosts != NULL )
        self->execution_hosts = fsd_explode(
                (const char*const*)job_info->exHosts, ' ',
                job_info->numExHosts );

    self->last_update_time = time(NULL);
    if( self->state >= DRMAA_PS_DONE )
        fsd_cond_broadcast( &self->status_cond );

    fsd_log_return(( "" ));
}
static void slurmdrmaa_job_update_status( fsd_job_t *self )
{
    job_info_msg_t *job_info = NULL;
    slurmdrmaa_job_t *slurm_self = (slurmdrmaa_job_t *) self;

    fsd_log_enter(( "({job_id=%s})", self->job_id ));

    fsd_mutex_lock( &self->session->drm_connection_mutex );
    TRY
     {
        if( slurm_load_job( &job_info, fsd_atoi(self->job_id), SHOW_ALL ) )
         {
            int _slurm_errno = slurm_get_errno();

            if( _slurm_errno == ESLURM_INVALID_JOB_ID )
                self->on_missing( self );
            else
                fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,
                        "slurm_load_jobs error: %s, job_id: %s",
                        slurm_strerror(_slurm_errno), self->job_id );
         }

        if( job_info )
         {
            fsd_log_debug(( "state = %d, state_reason = %d",
                    job_info->job_array[0].job_state,
                    job_info->job_array[0].state_reason ));

            switch( job_info->job_array[0].job_state & JOB_STATE_BASE )
             {
                case JOB_PENDING:
                    switch( job_info->job_array[0].state_reason )
                     {
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0)
                        case WAIT_HELD_USER: /* job is held by the user */
                            fsd_log_debug(("interpreting as DRMAA_PS_USER_ON_HOLD"));
                            self->state = DRMAA_PS_USER_ON_HOLD;
                            break;
#endif
                        case WAIT_HELD: /* job is held by the administrator */
                            fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_ON_HOLD"));
                            self->state = DRMAA_PS_SYSTEM_ON_HOLD;
                            break;
                        default:
                            fsd_log_debug(("interpreting as DRMAA_PS_QUEUED_ACTIVE"));
                            self->state = DRMAA_PS_QUEUED_ACTIVE;
                     }
                    break;
                case JOB_RUNNING:
                    fsd_log_debug(("interpreting as DRMAA_PS_RUNNING"));
                    self->state = DRMAA_PS_RUNNING;
                    break;
                case JOB_SUSPENDED:
                    if( slurm_self->user_suspended == true )
                     {
                        fsd_log_debug(("interpreting as DRMAA_PS_USER_SUSPENDED"));
                        self->state = DRMAA_PS_USER_SUSPENDED;
                     }
                    else
                     {
                        fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_SUSPENDED"));
                        self->state = DRMAA_PS_SYSTEM_SUSPENDED;
                     }
                    break;
                case JOB_COMPLETE:
                    fsd_log_debug(("interpreting as DRMAA_PS_DONE"));
                    self->state = DRMAA_PS_DONE;
                    self->exit_status = job_info->job_array[0].exit_code;
                    fsd_log_debug(("exit_status = %d -> %d",
                            self->exit_status, WEXITSTATUS(self->exit_status)));
                    break;
                case JOB_CANCELLED:
                    fsd_log_debug(("interpreting as DRMAA_PS_FAILED (aborted)"));
                    self->state = DRMAA_PS_FAILED;
                    self->exit_status = -1;
                    break; /* keep exit_status = -1 for aborted jobs */
                case JOB_FAILED:
                case JOB_TIMEOUT:
                case JOB_NODE_FAIL:
#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,3,0)
                case JOB_PREEMPTED:
#endif
                    fsd_log_debug(("interpreting as DRMAA_PS_FAILED"));
                    self->state = DRMAA_PS_FAILED;
                    self->exit_status = job_info->job_array[0].exit_code;
                    fsd_log_debug(("exit_status = %d -> %d",
                            self->exit_status, WEXITSTATUS(self->exit_status)));
                    break;
                default: /* unknown state */
                    fsd_log_error(("Unknown job state: %d. Please send bug report: http://apps.man.poznan.pl/trac/slurm-drmaa",
                            job_info->job_array[0].job_state));
             }

            if( job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_COMPLETING )
                fsd_log_debug(("Epilog completing"));

            if( job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_CONFIGURING )
                fsd_log_debug(("Nodes booting"));

            if( self->exit_status == -1 ) /* input, output or error path failure etc. */
                self->state = DRMAA_PS_FAILED;

            self->last_update_time = time(NULL);

            if( self->state >= DRMAA_PS_DONE )
             {
                fsd_log_debug(("exit_status = %d, WEXITSTATUS(exit_status) = %d",
                        self->exit_status, WEXITSTATUS(self->exit_status)));
                fsd_cond_broadcast( &self->status_cond );
             }
         }
     }
    FINALLY
     {
        if( job_info != NULL )
            slurm_free_job_info_msg( job_info );
        fsd_mutex_unlock( &self->session->drm_connection_mutex );
     }
    END_TRY

    fsd_log_return(( "" ));
}