/**
 * Format an LSF job identifier as text.
 *
 * Produces "<jobid>" when the array index extracted from @p jobId is 0,
 * and "<jobid>[<index>]" otherwise.
 *
 * @param jobId  Job identifier; split with LSB_ARRAY_JOBID / LSB_ARRAY_IDX.
 * @return Pointer to a function-local static buffer.  The buffer is
 *         overwritten by the next call, so copy the result if it must
 *         outlive that.
 */
char *
lsb_jobid2str( LS_LONG_INT jobId )
{
	static char string[32];

	/* snprintf keeps the write bounded by the buffer size whatever the
	 * macros expand to. */
	if( LSB_ARRAY_IDX(jobId) == 0 )
		snprintf( string, sizeof(string), "%d", LSB_ARRAY_JOBID(jobId) );
	else
		snprintf( string, sizeof(string), "%d[%d]",
				LSB_ARRAY_JOBID(jobId), LSB_ARRAY_IDX(jobId) );

	return string;
}
/**
 * Apply a DRMAA control action to the job by sending it a signal
 * through lsb_signaljob().
 *
 * Action to signal mapping:
 *   - DRMAA_CONTROL_SUSPEND, DRMAA_CONTROL_HOLD    -> SIGSTOP
 *   - DRMAA_CONTROL_RESUME,  DRMAA_CONTROL_RELEASE -> SIGCONT
 *   - DRMAA_CONTROL_TERMINATE                      -> SIGKILL
 *
 * The call is made while holding the session's drm_connection_mutex,
 * which is released in the FINALLY clause.
 *
 * @param self    Job object (treated as lsfdrmaa_job_t).
 * @param action  One of the DRMAA_CONTROL_* constants.
 *
 * Raises FSD_ERRNO_INVALID_ARGUMENT for an unrecognised action and
 * FSD_ERRNO_INTERNAL_ERROR when lsb_signaljob() returns a negative value.
 */
static void
lsfdrmaa_job_control( fsd_job_t *self, int action )
{
	/*
	 * XXX: waiting for job state change was removed
	 * since it is not required for drmaa_control
	 * to return after change completes.
	 */
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	LS_LONG_INT job_id;
	/* Named signo so it does not shadow signal() from <signal.h>.
	 * Initialised because the compiler cannot see that the default
	 * branch below is expected to raise rather than fall through. */
	int signo = 0;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));
	job_id = lsf_self->int_job_id;

	switch( action )
	 {
		case DRMAA_CONTROL_SUSPEND:
		case DRMAA_CONTROL_HOLD:
			signo = SIGSTOP;
			break;
		case DRMAA_CONTROL_RESUME:
		case DRMAA_CONTROL_RELEASE:
			signo = SIGCONT;
			break;
		case DRMAA_CONTROL_TERMINATE:
			/* TODO: sending SIGTERM (configurable)? */
			signo = SIGKILL;
			break;
		default:
			fsd_exc_raise_fmt(
					FSD_ERRNO_INVALID_ARGUMENT,
					"job::control: unknown action %d", action );
	 }

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	 {
		int rc = lsb_signaljob( job_id, signo );
		fsd_log_debug(( "lsb_signaljob( %d[%d], %d ) = %d",
					LSB_ARRAY_JOBID(job_id),
					LSB_ARRAY_IDX(job_id),
					signo, rc ));
		if( rc < 0 )
			fsd_exc_raise_fmt(
					FSD_ERRNO_INTERNAL_ERROR,
					"job::control: could not send %s to job %s",
					fsd_strsignal( signo ), self->job_id );
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	 }
	END_TRY

	fsd_log_return(( "" ));
}
/**
 * Refresh the job's cached state from LSF.
 *
 * Under the session's drm_connection_mutex this opens a job-info query
 * for the job (lsb_openjobinfo), reads one record (lsb_readjobinfo) and
 * passes it to lsf_self->read_job_info().  The query is always closed
 * and the mutex released in the FINALLY clause.
 *
 * When the query yields no records:
 *   - a job without FSD_JOB_CURRENT_SESSION set raises
 *     FSD_DRMAA_ERRNO_INVALID_JOB;
 *   - otherwise self->on_missing() is invoked and the query is retried.
 *
 * NOTE(review): the retry loop has no delay of its own.  If on_missing()
 * returns without raising or waiting, this re-queries immediately until
 * the job shows up - confirm on_missing()'s behaviour.
 *
 * @param self  Job object (treated as lsfdrmaa_job_t).
 */
static void
lsfdrmaa_job_update_status( fsd_job_t *self )
{
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	struct jobInfoEnt *volatile job_info = NULL;
	bool job_in_queue;

	/* time() returns time_t, which is not guaranteed to be int;
	 * cast so the argument really matches the "%d" conversion. */
	fsd_log_enter(( "({job_id=%s, time_delta=%d})", self->job_id,
				(int)( time(NULL) - self->submit_time ) ));

	do {
		fsd_mutex_lock( &self->session->drm_connection_mutex );
		TRY
		 {
			int n_records;
			/* Initialised: it is logged before the NULL check below, and
			 * lsb_readjobinfo() may not store to it on failure. */
			int more = 0;
			char * username = (lsf_self->int_job_id>0)?"all":NULL;

			fsd_log_debug(( "drm connection locked" ));

			n_records = lsb_openjobinfo( lsf_self->int_job_id,
						NULL, username, NULL, NULL, ALL_JOB );
			fsd_log_debug((
						"lsb_openjobinfo( %d[%d], NULL, %s, NULL, NULL, ALL_JOB ) =%d",
						LSB_ARRAY_JOBID(lsf_self->int_job_id),
						LSB_ARRAY_IDX(lsf_self->int_job_id),
						username?username:"******",
						n_records ));
			job_in_queue = n_records > 0;

			if( !job_in_queue )
			 {
				if( !(self->flags & FSD_JOB_CURRENT_SESSION) )
					fsd_exc_raise_code( FSD_DRMAA_ERRNO_INVALID_JOB );
				else
				 { /* handling missing job */
					self->on_missing( self );
				 }
			 }
			else
			 {
				job_info = lsb_readjobinfo( &more );
				fsd_log_debug(( "lsb_readjobinfo(...) =%p: more=%d",
							(void*)job_info, more ));
				if( job_info == NULL )
					fsd_exc_raise_lsf( "lsb_readjobinfo" );
				lsf_self->read_job_info( self, job_info );
			 }
		 }
		FINALLY
		 {
			/* lsfdrmaa_free_job_info( job_info ); */
			lsb_closejobinfo();
			fsd_log_debug(( "lsb_closejobinfo()" ));
			fsd_mutex_unlock( &self->session->drm_connection_mutex );
		 }
		END_TRY
	} while( !job_in_queue );

	fsd_log_return(( "" ));
}
/**
 * Copy the contents of an LSF job-info record into the generic job object.
 *
 * Steps performed:
 *   1. Dump the record's fields to the debug log.
 *   2. Translate the JOB_STAT_* bits in job_info->status into FSD_JOB_*
 *      flags (replacing the previous state/aborted flags) and into a
 *      single DRMAA_PS_* value stored in self->state.
 *   3. Derive self->exit_status from job_info->exitStatus.
 *   4. Update start/end time, CPU time, memory, swap, wall time and the
 *      execution host list.
 *   5. Stamp last_update_time and, when the resulting state is
 *      >= DRMAA_PS_DONE, broadcast on self->status_cond.
 *
 * @param self      Job object to update.
 * @param job_info  Record to read from; must not be NULL (it is
 *                  dereferenced unconditionally).
 */
static void
lsfdrmaa_job_read_job_info( fsd_job_t *self, struct jobInfoEnt *job_info )
{
	int status, flags;

	fsd_log_enter(( "" ));

	 {
		int i;
		fsd_log_debug(( "job status of %s updated from %d[%d]",
					self->job_id,
					LSB_ARRAY_JOBID(job_info->jobId),
					LSB_ARRAY_IDX(job_info->jobId) ));
		fsd_log_debug(( "\n status: 0x%x", (unsigned)job_info->status ));
		/* Time fields are cast so the argument type matches "%ld". */
		fsd_log_debug(( "\n submitTime: %ld", (long)job_info->submitTime ));
		fsd_log_debug(( "\n startTime: %ld", (long)job_info->startTime ));
		/* Previously printed startTime under the endTime label. */
		fsd_log_debug(( "\n endTime: %ld", (long)job_info->endTime ));
		fsd_log_debug(( "\n duration: %d", job_info->duration ));
		fsd_log_debug(( "\n cpuTime: %f", job_info->cpuTime ));
		fsd_log_debug(( "\n cwd: %s", job_info->cwd ));
		fsd_log_debug(( "\n fromHost: %s", job_info->fromHost ));
		fsd_log_debug(( "\n numExHosts: %d", job_info->numExHosts ));
		for( i = 0; i < job_info->numExHosts; i++ )
			fsd_log_debug(( "\n exHosts[%d]: %s", i, job_info->exHosts[i] ));
		fsd_log_debug(( "\n exitStatus: %d", job_info->exitStatus ));
		fsd_log_debug(( "\n execCwd: %s", job_info->execCwd ));
		fsd_log_debug(( "\n runRusage.mem: %d", job_info->runRusage.mem ));
		fsd_log_debug(( "\n runRusage.swap: %d", job_info->runRusage.swap ));
		fsd_log_debug(( "\n runRusage.utime: %d", job_info->runRusage.utime ));
		fsd_log_debug(( "\n runRusage.stime: %d", job_info->runRusage.stime ));
		fsd_log_debug(( "\n jName: %s", job_info->jName ));
		/* fsd_log_debug(( "\n execRusage: %s", job_info->execRusage )); */
	 }

	status = job_info->status;

	/* Build the FSD_JOB_* flag set from the LSF status bits. */
	flags = 0;
	if( status & (JOB_STAT_PEND | JOB_STAT_PSUSP) )
		flags |= FSD_JOB_QUEUED;
	if( status & JOB_STAT_PSUSP )
		flags |= FSD_JOB_HOLD;
	if( status & (JOB_STAT_RUN | JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_RUNNING;
	if( status & (JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_SUSPENDED;
	if( status & (JOB_STAT_DONE | JOB_STAT_EXIT) )
		flags |= FSD_JOB_TERMINATED;
	if( status & (JOB_STAT_EXIT | JOB_STAT_PERR) )
		flags |= FSD_JOB_ABORTED;
	self->flags &= ~(FSD_JOB_STATE_MASK | FSD_JOB_ABORTED);
	self->flags |= flags;

	/* Pick a single DRMAA state; the first matching bit wins, so the
	 * order of these tests is significant. */
	if( status & (JOB_STAT_WAIT | JOB_STAT_PEND) )
		self->state = DRMAA_PS_QUEUED_ACTIVE;
	else if( status & JOB_STAT_PSUSP )
		self->state = DRMAA_PS_USER_ON_HOLD;
	else if( status & JOB_STAT_RUN )
		self->state = DRMAA_PS_RUNNING;
	else if( status & JOB_STAT_SSUSP )
		self->state = DRMAA_PS_SYSTEM_SUSPENDED;
	else if( status & JOB_STAT_USUSP )
		self->state = DRMAA_PS_USER_SUSPENDED;
	else if( status & JOB_STAT_DONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_EXIT )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_PDONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_PERR )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_UNKWN )
		self->state = DRMAA_PS_UNDETERMINED;
	else
		self->state = DRMAA_PS_FAILED;

	/* Keep everything above the low byte of exitStatus.  If that leaves
	 * a zero value in bits 8 and up while LSF reports JOB_STAT_EXIT,
	 * set the lowest bit so the status is non-zero. */
	self->exit_status = job_info->exitStatus & ~0xff;
	if( (self->exit_status >> 8) == 0
			&& (job_info->status & JOB_STAT_EXIT) )
		self->exit_status |= 0x01;

	self->start_time = job_info->startTime;
	self->end_time = job_info->endTime;
	self->cpu_usage = job_info->cpuTime;
	/* Multiply in long rather than int: the plain int product overflows
	 * once the reported value exceeds INT_MAX/1024.
	 * NOTE(review): assumes mem_usage/vmem_usage are at least as wide
	 * as long - confirm against the fsd_job_t declaration. */
	self->mem_usage = max( self->mem_usage, 1024L*job_info->runRusage.mem );
	self->vmem_usage = max( self->vmem_usage, 1024L*job_info->runRusage.swap );
	self->walltime = 60*job_info->duration;
	self->n_execution_hosts = job_info->numExHosts;
	/* The host list is only built once; later updates leave it as is. */
	if( self->execution_hosts == NULL && job_info->exHosts != NULL )
		self->execution_hosts = fsd_explode(
				(const char*const*)job_info->exHosts, ' ',
				job_info->numExHosts );
	self->last_update_time = time(NULL);

	if( self->state >= DRMAA_PS_DONE )
		fsd_cond_broadcast( &self->status_cond );

	fsd_log_return(( "" ));
}