/*
 * Stops the session's wait thread if self->wait_thread_started is set.
 *
 * While holding self->mutex it clears wait_thread_run_flag and broadcasts
 * wait_condition.  It then calls fsd_mutex_unlock_times() (whose result is
 * kept in lock_count) and joins the thread; the inner FINALLY re-locks the
 * mutex lock_count times, whether or not the join raised.  Only afterwards
 * is wait_thread_started cleared.
 *
 * lock_count is volatile because it is written inside a TRY block and read
 * in the matching FINALLY.
 *
 * NOTE(review): this definition looks truncated in the text available
 * here -- the outer TRY has no visible FINALLY/END_TRY (so no unlock of
 * self->mutex is visible) and the function's closing brace is missing.
 * Confirm against the complete file.
 */
void fsd_drmaa_session_stop_wait_thread( fsd_drmaa_session_t *self ) { volatile int lock_count = 0; fsd_log_enter(( "" )); fsd_mutex_lock( &self->mutex ); TRY { fsd_log_debug(("started = %d run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag )); if( self->wait_thread_started ) { self->wait_thread_run_flag = false; fsd_log_debug(("started = %d run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag )); fsd_cond_broadcast( &self->wait_condition ); TRY { lock_count = fsd_mutex_unlock_times( &self->mutex ); fsd_thread_join( self->wait_thread_handle, NULL ); } FINALLY { int i; for( i = 0; i < lock_count; i++ ) fsd_mutex_lock( &self->mutex ); } END_TRY self->wait_thread_started = false; } }
/**
 * Logs the current call stack at debug level, outermost frame first
 * ("most recent call last").
 *
 * @param skip   Number of innermost caller frames to leave out.  The frame
 *               of fsd_log_stacktrace() itself is always left out as well.
 *               Negative values are treated as 0.
 * @param limit  Maximum number of frames to capture beyond the skipped
 *               ones; values <= 0 select the default of 128.
 *
 * Does nothing when the frame buffer cannot be allocated or when
 * backtrace_symbols() fails.
 */
void
fsd_log_stacktrace( int skip, int limit )
{
	void **ptr_buf = NULL;
	const char **symbols = NULL;
	int i, n;

	if( limit <= 0 )
		limit = 128;
	if( skip < 0 )
		skip = 0;
	skip++; /* without fsd_log_stacktrace() frame */

	n = skip + limit;
	ptr_buf = (void**)calloc( n, sizeof(void*) );
	if( ptr_buf == NULL )
		return;

	n = backtrace( ptr_buf, n );
	symbols = (const char**)backtrace_symbols( ptr_buf, n );
	if( symbols != NULL )
	{
		fsd_log_debug(( "Stacktrace (most recent call last):" ));
		/*
		 * Index 0 is the innermost frame, so the skipped frames are
		 * symbols[0..skip-1]; print the remainder from the outermost
		 * frame inwards.
		 */
		for( i = n - 1;  i >= skip;  i-- )
			fsd_log_debug(( "\n %s", symbols[i] ));
		free( symbols );
	}

	/* the address buffer was previously leaked on every call */
	free( ptr_buf );
}
/**
 * Refreshes a job's state from LSF.
 *
 * Under the session's drm_connection_mutex it opens a job-info query for
 * the job's LSF id.  If no record is found, a job that does not belong to
 * the current session raises FSD_DRMAA_ERRNO_INVALID_JOB; otherwise
 * self->on_missing() is invoked and the query is repeated.  When a record
 * is found it is handed to lsf_self->read_job_info().
 *
 * The query is always closed and the mutex released, even when an
 * exception is raised.
 *
 * @param self  Job to update (must actually be an lsfdrmaa_job_t).
 */
static void
lsfdrmaa_job_update_status( fsd_job_t *self )
{
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	struct jobInfoEnt *volatile job_info = NULL;
	bool job_in_queue;

	/* the time difference is not an int; cast so it matches the conversion */
	fsd_log_enter(( "({job_id=%s, time_delta=%ld})", self->job_id,
			(long)( time(NULL) - self->submit_time ) ));

	do {
		fsd_mutex_lock( &self->session->drm_connection_mutex );
		TRY
		{
			int n_records;
			int more;
			char * username = (lsf_self->int_job_id>0)?"all":NULL;

			fsd_log_debug(( "drm connection locked" ));

			n_records = lsb_openjobinfo( lsf_self->int_job_id,
					NULL, username, NULL, NULL, ALL_JOB );
			fsd_log_debug((
					"lsb_openjobinfo( %d[%d], NULL, %s, NULL, NULL, ALL_JOB ) =%d",
					LSB_ARRAY_JOBID(lsf_self->int_job_id),
					LSB_ARRAY_IDX(lsf_self->int_job_id),
					username?username:"******",
					n_records ));

			job_in_queue = n_records > 0;
			if( !job_in_queue )
			{
				if( !(self->flags & FSD_JOB_CURRENT_SESSION) )
				{
					fsd_exc_raise_code( FSD_DRMAA_ERRNO_INVALID_JOB );
				}
				else
				{ /* handling missing job */
					self->on_missing( self );
				}
			}
			else
			{
				job_info = lsb_readjobinfo( &more );
				fsd_log_debug(( "lsb_readjobinfo(...) =%p: more=%d",
						(void*)job_info, more ));
				if( job_info == NULL )
					fsd_exc_raise_lsf( "lsb_readjobinfo" );
				lsf_self->read_job_info( self, job_info );
			}
		}
		FINALLY
		{
			/* lsfdrmaa_free_job_info( job_info ); */
			lsb_closejobinfo();
			fsd_log_debug(( "lsb_closejobinfo()" ));
			fsd_mutex_unlock( &self->session->drm_connection_mutex );
		}
		END_TRY
	} while( !job_in_queue );

	fsd_log_return(( "" ));
}
/*
 * Waits on wait_condition (with mutex) until fsd_cond_timedwait() returns.
 *
 * The wake-up time is the current time plus self->pool_delay, replaced by
 * the caller's timeout when that is earlier.  If the timed wait returns
 * false while the caller's timeout was the wake-up time in use,
 * FSD_DRMAA_ERRNO_EXIT_TIMEOUT is raised.
 *
 * timeout may be NULL, in which case only the poll delay bounds the wait.
 */
void
fsd_drmaa_session_wait_for_job_status_change(
		fsd_drmaa_session_t *self,
		fsd_cond_t *wait_condition,
		fsd_mutex_t *mutex,
		const struct timespec *timeout
		)
{
	struct timespec poll_deadline;
	struct timespec *deadline = &poll_deadline;
	bool woken;

	if( timeout )
		fsd_log_enter(( "(timeout=%ld.%09ld)", timeout->tv_sec, timeout->tv_nsec ));
	else
		fsd_log_enter(( "(timeout=(null))" ));

	/* default wake-up: one poll interval from now */
	fsd_get_time( deadline );
	fsd_ts_add( deadline, &self->pool_delay );

	/* the caller's timeout wins when it expires first */
	if( timeout  &&  fsd_ts_cmp( timeout, deadline ) < 0 )
		deadline = (struct timespec*)timeout;

	fsd_log_debug(( "wait_for_job_status_change: waiting untill %ld.%09ld",
			deadline->tv_sec, deadline->tv_nsec ));

	woken = fsd_cond_timedwait( wait_condition, mutex, deadline );
	if( deadline == timeout  &&  !woken )
		fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );

	fsd_log_return(( ": next_check=%ld.%09ld, status_changed=%d",
			deadline->tv_sec, deadline->tv_nsec, (int)woken ));
}
/*
 * Decides whether a job exit status counts as "aborted".
 *
 *   stat == -1, 126 or 127        -> aborted
 *   any other stat <= 125         -> not aborted
 *   stat >= 128                   -> aborted only when (stat & 0x7f) is
 *                                    SIGTERM or SIGKILL
 *
 * The result is stored in *aborted.  error_diagnosis and error_diag_len
 * are not used.  Always returns DRMAA_ERRNO_SUCCESS.
 */
static int
oardrmaa_wifaborted( int *aborted, int stat, char *error_diagnosis, size_t error_diag_len )
{
	int was_aborted;

	fsd_log_info(("wifaborted(%d)>>>>", stat));
	fsd_log_debug(("wifaborted(%d)", stat));

	if( stat == -1  ||  stat == 126  ||  stat == 127 )
		was_aborted = true;
	else if( stat <= 125 )
		was_aborted = false;
	else
	{
		const int signo = stat & 0x7f;

		if( signo == SIGTERM  ||  signo == SIGKILL )
			was_aborted = true;
		else
			was_aborted = false;
	}

	*aborted = was_aborted;
	return DRMAA_ERRNO_SUCCESS;
}
/*
 * Applies a DRMAA control action to an LSF job by sending it a signal.
 *
 *   SUSPEND / HOLD     -> SIGSTOP
 *   RESUME  / RELEASE  -> SIGCONT
 *   TERMINATE          -> SIGKILL
 *
 * Any other action raises FSD_ERRNO_INVALID_ARGUMENT.  The signal is sent
 * with lsb_signaljob() under the session's drm_connection_mutex; a negative
 * return value raises FSD_ERRNO_INTERNAL_ERROR.
 */
static void
lsfdrmaa_job_control( fsd_job_t *self, int action )
{
	/*
	 * XXX: waiting for job state change was removed
	 * since it is not required for drmaa_control
	 * to return after change completes.
	 */
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	LS_LONG_INT job_id;
	int signo;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));
	job_id = lsf_self->int_job_id;

	if( action == DRMAA_CONTROL_SUSPEND  ||  action == DRMAA_CONTROL_HOLD )
	{
		signo = SIGSTOP;
	}
	else if( action == DRMAA_CONTROL_RESUME  ||  action == DRMAA_CONTROL_RELEASE )
	{
		signo = SIGCONT;
	}
	else if( action == DRMAA_CONTROL_TERMINATE )
	{
		/* TODO: sending SIGTERM (configurable)? */
		signo = SIGKILL;
	}
	else
	{
		fsd_exc_raise_fmt(
				FSD_ERRNO_INVALID_ARGUMENT,
				"job::control: unknown action %d", action );
	}

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	{
		int rc = lsb_signaljob( job_id, signo );

		fsd_log_debug(( "lsb_signaljob( %d[%d], %d ) = %d",
				LSB_ARRAY_JOBID(job_id),
				LSB_ARRAY_IDX(job_id),
				signo, rc ));
		if( rc < 0 )
			fsd_exc_raise_fmt(
					FSD_ERRNO_INTERNAL_ERROR,
					"job::control: could not send %s to job %s",
					fsd_strsignal( signo ), self->job_id
					);
	}
	FINALLY
	{
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	}
	END_TRY

	fsd_log_return(( "" ));
}
/*
 * Tears a session down.
 *
 * Step 1 (under self->mutex): set destroy_requested and broadcast
 *   wait_condition -- unless destruction was already requested, in which
 *   case the session is released and FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION is
 *   raised.
 * Step 2: signal every job in self->jobs.
 * Step 3 (under self->mutex): wait on destroy_condition until ref_cnt is
 *   no longer above 1, then stop the wait thread if it was started.
 * Step 4: call destroy_nowait().
 */
void
fsd_drmaa_session_destroy( fsd_drmaa_session_t *self )
{
	bool already_destroying = false;

	fsd_log_enter(( "" ));

	fsd_mutex_lock( &self->mutex );
	TRY
	{
		if( !self->destroy_requested )
		{
			self->destroy_requested = true;
			fsd_cond_broadcast( &self->wait_condition );
		}
		else
		{
			already_destroying = true;
		}
	}
	FINALLY
	{
		fsd_mutex_unlock( &self->mutex );
	}
	END_TRY

	if( already_destroying )
	{
		/* XXX: actually it can not happen in current implementation
		 * when using DRMAA API */
		self->release( self );
		fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );
	}

	self->jobs->signal_all( self->jobs );

	fsd_mutex_lock( &self->mutex );
	TRY
	{
		while( self->ref_cnt > 1 )
		{
			fsd_cond_wait( &self->destroy_condition, &self->mutex );
		}

		fsd_log_debug(("started = %d run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));

		if( self->wait_thread_started )
		{
			self->stop_wait_thread( self );
		}
	}
	FINALLY
	{
		fsd_mutex_unlock( &self->mutex );
	}
	END_TRY

	self->destroy_nowait( self );

	fsd_log_return(( "" ));
}
/*
 * Body of the session's wait thread.
 *
 * With self->mutex held, loops while self->wait_thread_run_flag is set:
 * updates the status of all jobs, broadcasts wait_condition, then waits on
 * wait_condition until "now + self->pool_delay".  An exception raised
 * inside one iteration is logged and the loop carries on.
 *
 * The mutex is released on exit if fsd_mutex_lock() reported it as locked.
 * Always returns NULL.
 */
void *
fsd_drmaa_session_wait_thread( fsd_drmaa_session_t *self )
{
	struct timespec wakeup_time;
	struct timespec *wakeup = &wakeup_time;
	bool volatile locked = false;

	fsd_log_enter(( "" ));
	locked = fsd_mutex_lock( &self->mutex );

	TRY
	{
		while( self->wait_thread_run_flag )
		{
			TRY
			{
				fsd_log_debug(( "wait thread: next iteration" ));
				self->update_all_jobs_status( self );
				fsd_cond_broadcast( &self->wait_condition );

				/* sleep one poll interval (or until woken) */
				fsd_get_time( wakeup );
				fsd_ts_add( wakeup, &self->pool_delay );
				fsd_cond_timedwait( &self->wait_condition, &self->mutex, wakeup );
			}
			EXCEPT_DEFAULT
			{
				const fsd_exc_t *e = fsd_exc_get();

				fsd_log_error(( "wait thread: <%d:%s>", e->code(e), e->message(e) ));
			}
			END_TRY
		}
	}
	FINALLY
	{
		if( locked )
			fsd_mutex_unlock( &self->mutex );
	}
	END_TRY

	fsd_log_return(( " =NULL" ));
	return NULL;
}
/*
 * Submits a job template to SLURM as a single batch submission and
 * registers the resulting job(s) with the session.
 *
 * The number of tasks is (end - start) / incr + 1.  With more than one
 * task an array specification "start-end:incr" is written to
 * job_desc.array_inx before slurm_submit_batch_job() is called; the
 * submission itself happens under self->drm_connection_mutex.
 *
 * After a successful submit, one job id string is built per task:
 *   "<job_id>_<task_id>"  for array jobs, "<job_id>" for a single job,
 *   with ".<cluster name>" appended when working_cluster_rec is set.
 * Each id is used to create a job object that is added to self->jobs and
 * then released.  working_cluster_rec is destroyed and reset at the end of
 * the TRY body.
 *
 * Several locals are volatile because they are written inside the TRY
 * block.
 *
 * NOTE(review): incr == 0 would divide by zero when computing n_jobs;
 * presumably callers validate it -- confirm.
 * NOTE(review): fsd_calloc( job_desc.array_inx, ARRAY_INX_MAXLEN, char* )
 * names char* as the element type for what is then used as a character
 * buffer; this looks like an over-allocation -- confirm intent.
 * NOTE(review): this definition looks truncated in the text available
 * here -- no FINALLY/EXCEPT/END_TRY, return statement or closing brace is
 * visible.  Confirm against the complete file.
 */
fsd_iter_t * slurmdrmaa_session_run_bulk( fsd_drmaa_session_t *self, const fsd_template_t *jt, int start, int end, int incr ) { int ret = 0; unsigned i = 0; int job_id = 0; int task_id = 0; fsd_job_t *volatile job = NULL; volatile unsigned n_jobs = (end - start) / incr + 1; char ** volatile job_ids = fsd_calloc( job_ids, n_jobs + 1, char* ); volatile bool connection_lock = false; fsd_environ_t *volatile env = NULL; job_desc_msg_t job_desc; submit_response_msg_t *submit_response = NULL; slurmdrmaa_init_job_desc( &job_desc ); TRY { connection_lock = fsd_mutex_lock( &self->drm_connection_mutex ); slurmdrmaa_job_create_req( self, jt, (fsd_environ_t**)&env , &job_desc, 0 ); /* Create job array spec if more than 1 task */ if(n_jobs > 1) { fsd_calloc(job_desc.array_inx, ARRAY_INX_MAXLEN, char*); ret = snprintf(job_desc.array_inx, ARRAY_INX_MAXLEN, "%d-%d:%d", start, end, incr ); if (ret < 0 || ret >= ARRAY_INX_MAXLEN) { fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "snprintf: not enough memory"); } fsd_log_debug(("array job '%s' prepared", job_desc.array_inx)); } /* Submit the batch job */ if(slurm_submit_batch_job(&job_desc, &submit_response) != SLURM_SUCCESS){ fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_submit_batch_job: %s",slurm_strerror(slurm_get_errno())); } connection_lock = fsd_mutex_unlock( &self->drm_connection_mutex ); /* Watch each job in the array */ for (i = 0; i < n_jobs; ++i) { job_id = (int) submit_response->job_id; task_id = start + i*incr; if (n_jobs > 1) { /* Array job */ if (!working_cluster_rec) job_ids[i] = fsd_asprintf("%d_%d", job_id, task_id); /* .0*/ else job_ids[i] = fsd_asprintf("%d_%d.%s", job_id, task_id, working_cluster_rec->name); } else { /* Single job */ if (!working_cluster_rec) job_ids[i] = fsd_asprintf("%d", job_id); /* .0*/ else job_ids[i] = fsd_asprintf("%d.%s", job_id, working_cluster_rec->name); } fsd_log_debug(("job %s submitted", job_ids[i])); job = slurmdrmaa_job_new( fsd_strdup(job_ids[i]) ); job->session = self; 
job->submit_time = time(NULL); self->jobs->add( self->jobs, job ); job->release( job ); job = NULL; } if (working_cluster_rec) slurmdb_destroy_cluster_rec(working_cluster_rec); working_cluster_rec = NULL; }
/**
 * Translates a DRMAA job template into an LSF submit request.
 *
 * Attributes are applied in this order: LSF defaults, job category (native
 * options looked up in session->job_categories), working directory,
 * command and arguments, job name, submission state, environment, start
 * time, input/output/error paths (including join_files, transfer_files and
 * e-mail handling), deadline and resource limits, and finally the native
 * specification, which is parsed last.
 *
 * @param session  Session providing job categories and configuration.
 * @param expand   Placeholder expander; its working-directory placeholder
 *                 is set when DRMAA_WD is present.
 * @param jt       Job template to read attributes from.
 * @param req      Submit request to fill in.
 * @param envp     Receives a newly created environment when the template
 *                 carries DRMAA_V_ENV; left untouched otherwise.
 *
 * Raises FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE or
 * FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES for malformed or missing
 * template attributes, and FSD_ERRNO_INTERNAL_ERROR for configuration
 * problems.
 */
void
lsfdrmaa_job_set_req(
		fsd_drmaa_session_t *session,
		fsd_expand_drmaa_ph_t *expand,
		const fsd_template_t *jt,
		struct submit *req,
		fsd_environ_t **envp
		)
{
	const char *input_path_orig = NULL;
	const char *output_path_orig = NULL;
	const char *error_path_orig = NULL;
	/* volatile: written inside TRY, read in FINALLY */
	char *volatile input_path = NULL;
	char *volatile output_path = NULL;
	char *volatile error_path = NULL;
	bool input_host = false;
	bool output_host = false;
	bool error_host = false;
	bool join_files = false;
	bool transfer_input = false;
	bool transfer_output = false;
	bool transfer_error = false;
	const char *job_category = "default";

	char **volatile argv = NULL;

	const char *value;
	const char *const *vector;

	/* set default lsf configs */
	{
		int i = 0;
		req->options = 0;
		req->options2 = 0;
		for( i = 0;  i < LSF_RLIM_NLIMITS;  i++ )
			req->rLimits[i] = DEFAULT_RLIMIT;
		req->beginTime = 0;
		req->termTime = 0;
	}

	/* job category */
	value = jt->get_attr( jt, DRMAA_JOB_CATEGORY );
	if( value )
		job_category = value;
	{
		fsd_conf_option_t *category_value = NULL;
		category_value = fsd_conf_dict_get( session->job_categories, job_category );
		if( category_value != NULL )
		{
			if( category_value->type != FSD_CONF_STRING )
				fsd_exc_raise_fmt(
						FSD_ERRNO_INTERNAL_ERROR,
						"configuration error: job category should be string"
						);
			lsfdrmaa_native_parse( category_value->val.string, req );
		}
		else
		{
			/* a missing "default" category is fine; an explicit one is not */
			if( value != NULL )
				fsd_exc_raise_fmt(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid job category: %s", job_category
						);
		}
	}

	/* job working directory */
	value = jt->get_attr( jt, DRMAA_WD );
	if( value )
	{
		char *cwd = NULL;
		cwd = expand->expand( expand, fsd_strdup(value),
				FSD_DRMAA_PH_HD | FSD_DRMAA_PH_INCR );
		expand->set( expand, FSD_DRMAA_PH_WD, cwd );
#ifdef SUB3_CWD
		req->cwd = fsd_strdup( cwd );
		req->options3 |= SUB3_CWD;
#else
		fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "DRMAA_WD attribute is not supported in this version of LSF.");
#endif
	}

	TRY
	{
		const char *command = NULL;
		unsigned n_args = 0;
		const char *const *i;
		int j;

		/* remote command */
		command = jt->get_attr( jt, DRMAA_REMOTE_COMMAND );
		if( command == NULL )
			fsd_exc_raise_msg(
					FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,
					"drmaa_remote_command not set for job template"
					);

		/* arguments list */
		vector = jt->get_v_attr( jt, DRMAA_V_ARGV );
		if( vector )
		{
			for( i = vector;  *i;  i++ )
				n_args++;
		}
		/* "exec" + command + arguments + terminating NULL */
		fsd_calloc( argv, n_args+3, char* );
		argv[0] = fsd_strdup("exec");
		argv[1] = expand->expand( expand, fsd_strdup(command),
				FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );
		if( vector )
		{
			for( i = vector, j = 2;  *i;  i++, j++ )
				argv[j] = expand->expand( expand, fsd_strdup(*i),
						FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );
		}

		req->command = lsfdrmaa_job_quote_command( (const char*const*)argv );
	}
	FINALLY
	{
		fsd_free_vector( argv );
	}
	END_TRY

	/* job name */
	value = jt->get_attr( jt, DRMAA_JOB_NAME );
	if( value )
	{
		req->jobName = fsd_strdup(value);
		req->options |= SUB_JOB_NAME;
	}

	/* job state at submit */
	value = jt->get_attr( jt, DRMAA_JS_STATE );
	if( value )
	{
		if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_ACTIVE ) )
			/* bitwise complement: clear only the hold bit (a logical
			 * "!" here would wipe every options2 flag) */
			req->options2 &= ~SUB2_HOLD;
		else if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_HOLD ) )
			req->options2 |= SUB2_HOLD;
		else
			fsd_exc_raise_msg(
					FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
					"invalid value of drmaa_js_state attribute" );
	}

	/* environment */
	vector = jt->get_v_attr( jt, DRMAA_V_ENV );
	if( vector )
	{
		fsd_environ_t *env;
		*envp = env = fsd_environ_new( NULL );
		env->update( env, vector );
	}

	/* start time */
	value = jt->get_attr( jt, DRMAA_START_TIME );
	if( value )
	{
		req->beginTime = fsd_datetime_parse( value );
		fsd_log_debug(( "\n drmaa_start_time: %s -> %ld",
					value, (long)req->beginTime ));
	}

	TRY
	{
		/* input path */
		input_path_orig = jt->get_attr( jt, DRMAA_INPUT_PATH );
		if( input_path_orig )
		{
			input_path = internal_map_file( expand, input_path_orig, &input_host,
							"input" );
			fsd_log_debug(( "\n drmaa_input_path: %s -> %s",
						input_path_orig, input_path ));
		}

		/* output path */
		output_path_orig = jt->get_attr( jt, DRMAA_OUTPUT_PATH );
		if( output_path_orig )
		{
			output_path = internal_map_file( expand, output_path_orig, &output_host,
							"output" );
			fsd_log_debug(( "\n drmaa_output_path: %s -> %s",
						output_path_orig, output_path ));
		}

		/* error path */
		error_path_orig = jt->get_attr( jt, DRMAA_ERROR_PATH );
		if( error_path_orig )
		{
			error_path = internal_map_file( expand, error_path_orig, &error_host,
							"error" );
			fsd_log_debug(( "\n drmaa_error_path: %s -> %s",
						error_path_orig, error_path ));
		}

		/* join files */
		value = jt->get_attr( jt, DRMAA_JOIN_FILES );
		if( value )
		{
			if( (value[0] == 'y' || value[0] == 'Y')  &&  value[1] == '\0' )
				join_files = true;
			else if( (value[0] == 'n' || value[0] == 'N')  &&  value[1] == '\0' )
				join_files = false;
			else
				fsd_exc_raise_msg(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid value of drmaa_join_files attribute" );
		}

		if( join_files )
		{
			/*
			 * LSF by default joins output and error streams
			 * when error file is not set.
			 */
			if( error_path )
			{
				if( output_path == NULL )
					fsd_exc_raise_msg(
							FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,
							"drmaa_join_files is set and output file is not given" );
				if( 0 != strcmp( output_path, error_path ) )
					fsd_log_warning(( "Error file was given but will be ignored "
								"since drmaa_join_files was set." ));
				fsd_free( error_path );
				error_path = NULL;
			}
		}
		else
		{
			/*
			 * If error path is not set, we must set it to /dev/null
			 * to prevent joining files.
			 */
			if( error_path == NULL  &&  output_path )
				error_path = fsd_strdup( "/dev/null" );
			if( output_path == NULL  &&  error_path )
				output_path = fsd_strdup( "/dev/null" );
			/*
			 * NOTE(review): when error_path is non-NULL this string is
			 * replaced further down without being freed -- confirm
			 * whether that small leak is acceptable.
			 */
			if( req->errFile == NULL )
			{
				req->errFile = fsd_strdup( "/dev/null" );
				req->options |= SUB_ERR_FILE;
#ifdef SUB2_OVERWRITE_ERR_FILE
				req->options2 &= ~SUB2_OVERWRITE_ERR_FILE;
#endif
			}
		}

		/* transfer files */
		value = jt->get_attr( jt, DRMAA_TRANSFER_FILES );
		if( value )
		{
			const char *i;
			for( i = value;  *i;  i++ )
			{
				switch( *i )
				{
					case 'i':  transfer_input = true;  break;
					case 'o':  transfer_output = true;  break;
					case 'e':  transfer_error = true;  break;
					default:
						fsd_exc_raise_fmt(
								FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
								"invalid character '%c' in drmaa_transfer_files: %s",
								*i, value
								);
				}
			}
		}

#	if 0
		{
			/*
			 * Input file is transfered by LSF from submission host whenever
			 * it isn't found on execution host regardless of explicit file transfers.
			 * When drmaa_transfer_files contains ``i`` input file is send
			 * explicitly because it may be outdated or otherwise differ.
			 */
			static const char *name[3]
				= {"input", "output", "error"};
			static const int options[3]
				= { XF_OP_SUB2EXEC, XF_OP_EXEC2SUB, XF_OP_EXEC2SUB };
			const char *path[3];
			bool host[3], transfer[3];
			int i;

			path[i=0] = input_path;
			path[++i] = output_path;
			path[++i] = error_path;
			host[i=0] = input_host;
			host[++i] = output_host;
			host[++i] = error_host;
			transfer[i=0] = transfer_input;
			transfer[++i] = transfer_output;
			transfer[++i] = transfer_error;

			for( i = 0;  i < 3;  i++ )
			{
				struct xFile *t;
				if( !(transfer[i]  &&  path[i] != NULL) )
					continue;
				if( 0 == strcmp( path[i], "/dev/null" ) )
					continue;
				if( host[i] )
					fsd_log_warning(( "hostname in drmaa_%s_path ignored", name[i] ));
				fsd_log_debug(( "setting transfer of %s file (%s) "
							"to execution host", name[i], path[i] ));
				fsd_realloc( req->xf, req->nxf+1, struct xFile );
				t = &req->xf[ req->nxf++ ];
				memset( t, 0, sizeof(struct xFile) );
				if( sizeof(t->subFn) == MAXFILENAMELEN )
				{ /* LSF 6 */
					strlcpy( t->subFn, path[i], MAXFILENAMELEN );
					strlcpy( t->execFn, path[i], MAXFILENAMELEN );
				}
				else
				{ /* LSF 7 */
					*(char**)&t->subFn = fsd_strdup( path[i] );
					*(char**)&t->execFn = fsd_strdup( path[i] );
				}
				t->options = options[i];
			}

			if( req->nxf > 0 )
				req->options |= SUB_OTHER_FILES;
		}
#	endif /* transfer files */

		/* email addresses to send notifications */
		vector = jt->get_v_attr( jt, DRMAA_V_EMAIL );
		if( vector  &&  vector[0] )
		{
			/* only to one email address message may be send */
			req->mailUser = fsd_strdup( vector[0] );
			req->options |= SUB_MAIL_USER | SUB_NOTIFY_END;
#if 0
			if( vector[1] != NULL )
				fsd_log_warning(( "LSF only supports one e-mail "
							"notification address" ));
#endif
		}

		/* block email */
		value = jt->get_attr( jt, DRMAA_BLOCK_EMAIL );
		if( value )
		{
			bool block;
			if( strcmp(value, "1") == 0 )
				block = true;
			else if( strcmp(value, "0") == 0 )
				block = false;
			else
				fsd_exc_raise_msg(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid value of drmaa_block_email attribute" );
			if( block )
			{
				if( output_path == NULL )
				{
					fsd_log_debug(( "output path not set and we want to block e-mail, "
								"set to /dev/null" ));
					output_path = fsd_strdup( "/dev/null" );
				}
				req->options &= ~SUB_NOTIFY_END;
			}
			else
			{
				/* SUB_NOTIFY_END should force sending e-mail even if outfile is set */
				req->options |= SUB_NOTIFY_END;
			}
		}

		if( !((lsfdrmaa_session_t*)session)->prepand_report_to_output
				&&  (req->options & SUB_NOTIFY_END) == 0
				&&  output_path != NULL )
		{
			req->options |= SUB_MAIL_USER | SUB_NOTIFY_END;
			fsd_free( req->mailUser ); /* when email was set but notification was blocked */
			req->mailUser = fsd_strdup( "notexistent" );
		}

		/* hand the paths over to the request; NULL them so FINALLY keeps them */
		if( input_path )
		{
			req->inFile = input_path;
			req->options |= SUB_IN_FILE;
			input_path = NULL;
		}

		if( output_path )
		{
			req->outFile = output_path;
			req->options |= SUB_OUT_FILE;
#ifdef SUB2_OVERWRITE_OUT_FILE
			if( 0 != strcmp( output_path, "/dev/null" ) )
				req->options2 |= SUB2_OVERWRITE_OUT_FILE;
#endif
			output_path = NULL;
		}

		if( error_path )
		{
			req->errFile = error_path;
			req->options |= SUB_ERR_FILE;
#ifdef SUB2_OVERWRITE_ERR_FILE
			if( 0 != strcmp( error_path, "/dev/null" ) )
				req->options2 |= SUB2_OVERWRITE_ERR_FILE;
#endif
			error_path = NULL;
		}
	}
	FINALLY
	{
		fsd_free( input_path );
		fsd_free( output_path );
		fsd_free( error_path );
	}
	END_TRY

	/* deadline time */
	value = jt->get_attr( jt, DRMAA_DEADLINE_TIME );
	if( value )
		req->termTime = fsd_datetime_parse( value );

	/* wall clock time hard limit */
	value = jt->get_attr( jt, DRMAA_WCT_HLIMIT );
	if( value )
		req->rLimits[ LSF_RLIMIT_RUN ] = fsd_parse_timedelta( value );

	/* wall clock time soft limit */
#ifdef SUB3_RUNTIME_ESTIMATION
	value = jt->get_attr( jt, DRMAA_WCT_SLIMIT );
	if( value )
	{
		/* OR the flag in: plain assignment would drop SUB3_CWD and any
		 * options3 bits set by the job category */
		req->options3 |= SUB3_RUNTIME_ESTIMATION;
		req->runtimeEstimation = fsd_parse_timedelta( value );
	}
#endif

	/* duration hard limit */
	value = jt->get_attr( jt, DRMAA_DURATION_HLIMIT );
	if( value )
		req->rLimits[ LSF_RLIMIT_CPU ] = fsd_parse_timedelta( value );

	/* native specification */
	value = jt->get_attr( jt, DRMAA_NATIVE_SPECIFICATION );
	if( value )
		lsfdrmaa_native_parse( value, req );

	lsfdrmaa_dump_submit_req(req);
}
/**
 * Copies the contents of an LSF job-info record into the generic job
 * object.
 *
 * Logs the record at debug level, then derives:
 *  - self->flags  (queued / hold / running / suspended / terminated /
 *    aborted) from the JOB_STAT_* bits,
 *  - self->state  (DRMAA_PS_*) from the first matching status bit,
 *  - self->exit_status from exitStatus with the low byte cleared; bit 0 is
 *    set when the job is in JOB_STAT_EXIT but the remaining exit code is 0,
 *  - timing and resource usage figures, and the execution host list (only
 *    when it has not been filled in before).
 *
 * Broadcasts self->status_cond once the state is DRMAA_PS_DONE or beyond.
 *
 * @param self      Job to update.
 * @param job_info  Record obtained from lsb_readjobinfo(); must not be NULL.
 */
static void
lsfdrmaa_job_read_job_info( fsd_job_t *self, struct jobInfoEnt *job_info )
{
	int status, flags;

	fsd_log_enter(( "" ));

	{
		int i;
		fsd_log_debug(( "job status of %s updated from %d[%d]",
					self->job_id,
					LSB_ARRAY_JOBID(job_info->jobId),
					LSB_ARRAY_IDX(job_info->jobId) ));
		fsd_log_debug(( "\n status: 0x%x", job_info->status ));
		fsd_log_debug(( "\n submitTime: %ld", (long)job_info->submitTime ));
		fsd_log_debug(( "\n startTime: %ld", (long)job_info->startTime ));
		/* this line used to print startTime under the endTime label */
		fsd_log_debug(( "\n endTime: %ld", (long)job_info->endTime ));
		fsd_log_debug(( "\n duration: %d", job_info->duration ));
		fsd_log_debug(( "\n cpuTime: %f", job_info->cpuTime ));
		fsd_log_debug(( "\n cwd: %s", job_info->cwd ));
		fsd_log_debug(( "\n fromHost: %s", job_info->fromHost ));
		fsd_log_debug(( "\n numExHosts: %d", job_info->numExHosts ));
		for( i = 0;  i < job_info->numExHosts;  i++ )
			fsd_log_debug(( "\n exHosts[%d]: %s", i, job_info->exHosts[i] ));
		fsd_log_debug(( "\n exitStatus: %d", job_info->exitStatus ));
		fsd_log_debug(( "\n execCwd: %s", job_info->execCwd ));
		fsd_log_debug(( "\n runRusage.mem: %d", job_info->runRusage.mem ));
		fsd_log_debug(( "\n runRusage.swap: %d", job_info->runRusage.swap ));
		fsd_log_debug(( "\n runRusage.utime: %d", job_info->runRusage.utime ));
		fsd_log_debug(( "\n runRusage.stime: %d", job_info->runRusage.stime ));
		fsd_log_debug(( "\n jName: %s", job_info->jName ));
		/* fsd_log_debug(( "\n execRusage: %s", job_info->execRusage )); */
	}

	status = job_info->status;

	/* generic job flags */
	flags = 0;
	if( status & (JOB_STAT_PEND | JOB_STAT_PSUSP) )
		flags |= FSD_JOB_QUEUED;
	if( status & JOB_STAT_PSUSP )
		flags |= FSD_JOB_HOLD;
	if( status & (JOB_STAT_RUN | JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_RUNNING;
	if( status & (JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_SUSPENDED;
	if( status & (JOB_STAT_DONE | JOB_STAT_EXIT) )
		flags |= FSD_JOB_TERMINATED;
	if( status & (JOB_STAT_EXIT | JOB_STAT_PERR) )
		flags |= FSD_JOB_ABORTED;
	self->flags &= ~(FSD_JOB_STATE_MASK | FSD_JOB_ABORTED);
	self->flags |= flags;

	/* DRMAA program state: first matching bit wins */
	if( status & (JOB_STAT_WAIT | JOB_STAT_PEND) )
		self->state = DRMAA_PS_QUEUED_ACTIVE;
	else if( status & JOB_STAT_PSUSP )
		self->state = DRMAA_PS_USER_ON_HOLD;
	else if( status & JOB_STAT_RUN )
		self->state = DRMAA_PS_RUNNING;
	else if( status & JOB_STAT_SSUSP )
		self->state = DRMAA_PS_SYSTEM_SUSPENDED;
	else if( status & JOB_STAT_USUSP )
		self->state = DRMAA_PS_USER_SUSPENDED;
	else if( status & JOB_STAT_DONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_EXIT )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_PDONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_PERR )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_UNKWN )
		self->state = DRMAA_PS_UNDETERMINED;
	else
		self->state = DRMAA_PS_FAILED;

	self->exit_status = job_info->exitStatus & ~0xff;
	if( (self->exit_status >> 8) == 0
			&&  (job_info->status & JOB_STAT_EXIT) )
		self->exit_status |= 0x01;
	self->start_time = job_info->startTime;
	self->end_time = job_info->endTime;
	self->cpu_usage = job_info->cpuTime;
	/* keep the peak values seen so far */
	self->mem_usage = max( self->mem_usage, 1024*job_info->runRusage.mem );
	self->vmem_usage = max( self->vmem_usage, 1024*job_info->runRusage.swap );
	self->walltime = 60*job_info->duration;
	self->n_execution_hosts = job_info->numExHosts;
	if( self->execution_hosts == NULL  &&  job_info->exHosts != NULL )
		self->execution_hosts
			= fsd_explode( (const char*const*)job_info->exHosts, ' ',
					job_info->numExHosts );
	self->last_update_time = time(NULL);
	if( self->state >= DRMAA_PS_DONE )
		fsd_cond_broadcast( &self->status_cond );

	fsd_log_return(( "" ));
}
/*
 * Applies a DRMAA control action to a SLURM job, holding the session's
 * drm_connection_mutex for the duration of the call.
 *
 *   SUSPEND   -> slurm_suspend(), then marks the job as user-suspended
 *   RESUME    -> slurm_resume(), then clears the user-suspended mark
 *   HOLD      -> slurm_update_job() with priority 0
 *   RELEASE   -> slurm_update_job() with priority INFINITE
 *   TERMINATE -> slurm_kill_job() with SIGKILL
 *
 * A SLURM call returning -1 raises FSD_ERRNO_INTERNAL_ERROR; an unknown
 * action raises FSD_ERRNO_INVALID_ARGUMENT.
 */
static void
slurmdrmaa_job_control( fsd_job_t *self, int action )
{
	slurmdrmaa_job_t *slurm_self = (slurmdrmaa_job_t*)self;
	job_desc_msg_t update_req;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	{
		switch( action )
		{
			case DRMAA_CONTROL_SUSPEND:
				if( slurm_suspend( fsd_atoi(self->job_id) ) == -1 )
				{
					fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_suspend error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				slurm_self->user_suspended = true;
				break;

			case DRMAA_CONTROL_RESUME:
				if( slurm_resume( fsd_atoi(self->job_id) ) == -1 )
				{
					fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_resume error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				slurm_self->user_suspended = false;
				break;

			case DRMAA_CONTROL_HOLD:
				/* change priority to 0*/
				slurm_init_job_desc_msg( &update_req );
				slurm_self->old_priority = update_req.priority;
				update_req.job_id = atoi( self->job_id );
				update_req.priority = 0;
				update_req.alloc_sid = 0;
				if( slurm_update_job( &update_req ) == -1 )
				{
					fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_update_job error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				break;

			case DRMAA_CONTROL_RELEASE:
				/* change priority back*/
				slurm_init_job_desc_msg( &update_req );
				update_req.priority = INFINITE;
				update_req.job_id = atoi( self->job_id );
				if( slurm_update_job( &update_req ) == -1 )
				{
					fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_update_job error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				break;

			case DRMAA_CONTROL_TERMINATE:
				if( slurm_kill_job( fsd_atoi(self->job_id), SIGKILL, 0 ) == -1 )
				{
					fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_terminate_job error: %s,job_id: %s",slurm_strerror(slurm_get_errno()),self->job_id);
				}
				break;

			default:
				fsd_exc_raise_fmt(
						FSD_ERRNO_INVALID_ARGUMENT,
						"job::control: unknown action %d", action );
		}

		fsd_log_debug(("job::control: successful"));
	}
	FINALLY
	{
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	}
	END_TRY

	fsd_log_return(( "" ));
}
void slurmdrmaa_job_create( fsd_drmaa_session_t *session, const fsd_template_t *jt, fsd_environ_t **envp, fsd_expand_drmaa_ph_t *expand, job_desc_msg_t * job_desc, int n_job ) { const char *input_path_orig = NULL; const char *output_path_orig = NULL; const char *error_path_orig = NULL; char *volatile input_path = NULL; char *volatile output_path = NULL; char *volatile error_path = NULL; bool input_host = false; bool output_host = false; bool error_host = false; bool join_files = false; const char *value; const char *const *vector; const char *job_category = "default"; job_desc->user_id = getuid(); job_desc->group_id = getgid(); job_desc->env_size = 0; /* job name */ value = jt->get_attr( jt, DRMAA_JOB_NAME ); if( value ) { job_desc->name = fsd_strdup(value); fsd_log_debug(("# job_name = %s",job_desc->name)); } /* job state at submit */ value = jt->get_attr( jt, DRMAA_JS_STATE ); if( value ) { if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_ACTIVE ) ) {} else if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_HOLD ) ) { job_desc->priority = 0; fsd_log_debug(("# hold = user")); } else { fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE, "invalid value of drmaa_js_state attribute" ); } } TRY { const char *command = NULL; char *command_expanded = NULL; char *temp_script_old = NULL; char *temp_script = ""; const char *const *i; int j; /* remote command */ command = jt->get_attr( jt, DRMAA_REMOTE_COMMAND ); if( command == NULL ) fsd_exc_raise_msg( FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES, "drmaa_remote_command not set for job template" ); command_expanded = expand->expand( expand, fsd_strdup(command), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD ); temp_script = fsd_asprintf("#!/bin/bash\n%s",command_expanded); fsd_free(command_expanded); /* arguments list */ vector = jt->get_v_attr( jt, DRMAA_V_ARGV ); if( vector ) { for( i = vector, j = 2; *i; i++, j++ ) { char *arg_expanded = expand->expand( expand, fsd_strdup(*i), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD ); temp_script_old = 
fsd_strdup(temp_script); if (strcmp(temp_script, "") != 0) { fsd_free(temp_script); } /* add too script */ temp_script = fsd_asprintf("%s '%s'", temp_script_old, arg_expanded); fsd_free(temp_script_old); fsd_free(arg_expanded); } } job_desc->script = fsd_asprintf("%s\n", temp_script); fsd_log_debug(("# Script:\n%s", job_desc->script)); fsd_free(temp_script); } END_TRY /* start time */ value = jt->get_attr( jt, DRMAA_START_TIME ); if( value ) { job_desc->begin_time = fsd_datetime_parse( value ); fsd_log_debug(( "\n drmaa_start_time: %s -> %ld", value, (long)job_desc->begin_time)); } /* propagate all environment variables from submission host */ { extern char **environ; char **i; unsigned j = 0; for ( i = environ; *i; i++) { job_desc->env_size++; } fsd_log_debug(("environ env_size = %d",job_desc->env_size)); fsd_calloc(job_desc->environment, job_desc->env_size+1, char *); for ( i = environ; *i; i++,j++ ) { job_desc->environment[j] = fsd_strdup(*i); } } /* environment */ vector = jt->get_v_attr( jt, DRMAA_V_ENV ); if( vector ) { const char *const *i; unsigned j = 0; unsigned env_offset = job_desc->env_size; for( i = vector; *i; i++ ) { job_desc->env_size++; } fsd_log_debug(("jt env_size = %d",job_desc->env_size)); fsd_log_debug(("# environment =")); fsd_realloc(job_desc->environment, job_desc->env_size+1, char *); for( i = vector; *i; i++,j++ ) { job_desc->environment[j + env_offset] = fsd_strdup(*i); fsd_log_debug((" %s", job_desc->environment[j+ env_offset])); } } /* wall clock time hard limit */ value = jt->get_attr( jt, DRMAA_WCT_HLIMIT ); if (value) { job_desc->time_limit = slurmdrmaa_datetime_parse( value ); fsd_log_debug(("# wct_hlimit = %s -> %ld",value, (long int)slurmdrmaa_datetime_parse( value ))); } /*expand->set(expand, FSD_DRMAA_PH_INCR,fsd_asprintf("%d", n_job));*/ /* set current value */ /* TODO: test drmaa_ph_incr */ /* job working directory */ value = jt->get_attr( jt, DRMAA_WD ); if( value ) { char *cwd_expanded = expand->expand( expand, 
fsd_strdup(value), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_INCR ); expand->set( expand, FSD_DRMAA_PH_WD, fsd_strdup(cwd_expanded)); fsd_log_debug(("# work_dir = %s",cwd_expanded)); job_desc->work_dir = fsd_strdup(cwd_expanded); fsd_free(cwd_expanded); } else { char cwdbuf[4096] = ""; if ((getcwd(cwdbuf, 4095)) == NULL) { char errbuf[256] = "InternalError"; (void)strerror_r(errno, errbuf, 256); /*on error the default message would be returned */ fsd_log_error(("getcwd failed: %s", errbuf)); job_desc->work_dir = fsd_strdup("."); } else { job_desc->work_dir = fsd_strdup(cwdbuf); } fsd_log_debug(("work_dir(default:CWD) %s", job_desc->work_dir)); } TRY { /* input path */ input_path_orig = jt->get_attr( jt, DRMAA_INPUT_PATH ); if( input_path_orig ) { input_path = internal_map_file( expand, input_path_orig, &input_host,"input" ); fsd_log_debug(( "\n drmaa_input_path: %s -> %s", input_path_orig, input_path )); } /* output path */ output_path_orig = jt->get_attr( jt, DRMAA_OUTPUT_PATH ); if( output_path_orig ) { output_path = internal_map_file( expand, output_path_orig, &output_host,"output" ); fsd_log_debug(( "\n drmaa_output_path: %s -> %s", output_path_orig, output_path )); } /* error path */ error_path_orig = jt->get_attr( jt, DRMAA_ERROR_PATH ); if( error_path_orig ) { error_path = internal_map_file( expand, error_path_orig, &error_host,"error" ); fsd_log_debug(( "\n drmaa_error_path: %s -> %s", error_path_orig, error_path )); } /* join files */ value = jt->get_attr( jt, DRMAA_JOIN_FILES ); if( value ) { if( (value[0] == 'y' || value[0] == 'Y') && value[1] == '\0' ) join_files = true; else if( (value[0] == 'n' || value[0] == 'N') && value[1] == '\0' ) join_files = false; else fsd_exc_raise_msg( FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE, "invalid value of drmaa_join_files attribute" ); } if( join_files ) { if( output_path == NULL ) fsd_exc_raise_msg(FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES, "drmaa_join_files is set and output file is not given" ); if( error_path!=NULL && 0 != 
strcmp( output_path, error_path ) ) fsd_log_warning(( "Error file was given but will be ignored since drmaa_join_files was set." )); if (error_path) fsd_free(error_path); error_path = fsd_strdup(output_path); } else { if( error_path == NULL && output_path ) error_path = fsd_strdup( "/dev/null" ); if( output_path == NULL && error_path ) output_path = fsd_strdup( "/dev/null" ); } /* email addresses to send notifications */ vector = jt->get_v_attr( jt, DRMAA_V_EMAIL ); if( vector && vector[0] ) { /* only to one email address message may be send */ job_desc->mail_user = fsd_strdup(vector[0]); job_desc->mail_type = MAIL_JOB_BEGIN | MAIL_JOB_END | MAIL_JOB_FAIL; fsd_log_debug(("# mail_user = %s\n",vector[0])); fsd_log_debug(("# mail_type = %o\n",job_desc->mail_type)); if( vector[1] != NULL ) { fsd_log_error(( "SLURM only supports one e-mail notification address" )); fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,"SLURM only supports one e-mail notification address"); } } /* block email */ value = jt->get_attr( jt, DRMAA_BLOCK_EMAIL ); if( value ) { bool block; if( strcmp(value, "0") == 0 ) { block = true; fsd_log_debug(("# block_email = true")); fsd_log_debug(("# mail_user delated")); fsd_free(job_desc->mail_user); job_desc->mail_user = NULL; } else if( strcmp(value, "1") == 0 ) block = false; else fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,"invalid value of drmaa_block_email attribute" ); if( block && output_path == NULL ) { fsd_log_debug(( "output path not set and we want to block e-mail, set to /dev/null" )); output_path = fsd_strdup( "/dev/null" ); } } if( input_path ) { job_desc->std_in = fsd_strdup(input_path); fsd_log_debug(("# input = %s", input_path)); } if( output_path ) { job_desc->std_out = fsd_strdup(output_path); fsd_log_debug(("# output = %s", output_path)); } if( error_path ) { job_desc->std_err = fsd_strdup(error_path); fsd_log_debug(("# error = %s", error_path)); } } FINALLY { fsd_free( input_path ); fsd_free( output_path ); 
fsd_free( error_path ); input_path = NULL; output_path = NULL; error_path = NULL; } END_TRY /* job category */ value = jt->get_attr( jt, DRMAA_JOB_CATEGORY ); if( value ) job_category = value; { fsd_conf_option_t *category_value = NULL; category_value = fsd_conf_dict_get( session->job_categories, job_category ); if( category_value != NULL ) { if( category_value->type != FSD_CONF_STRING ) fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR, "configuration error: job category should be string" ); fsd_log_debug(("# Job category %s : %s\n",value,category_value->val.string)); slurmdrmaa_parse_native(job_desc,category_value->val.string); } else { if( value != NULL ) fsd_exc_raise_fmt( FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE, "invalid job category: %s", job_category ); } } /* set defaults for constraints - ref: slurm.h */ fsd_log_debug(("# Setting defaults for tasks and processors" )); job_desc->num_tasks = 1; job_desc->min_cpus = 0; job_desc->cpus_per_task = 0; job_desc->pn_min_cpus = 0; /* native specification */ value = jt->get_attr( jt, DRMAA_NATIVE_SPECIFICATION ); if( value ) { fsd_log_debug(("# Native specification: %s\n", value)); slurmdrmaa_parse_native(job_desc, value); } }
static void slurmdrmaa_job_update_status( fsd_job_t *self ) { job_info_msg_t *job_info = NULL; slurmdrmaa_job_t * slurm_self = (slurmdrmaa_job_t *) self; fsd_log_enter(( "({job_id=%s})", self->job_id )); fsd_mutex_lock( &self->session->drm_connection_mutex ); TRY { if ( slurm_load_job( &job_info, fsd_atoi(self->job_id), SHOW_ALL) ) { int _slurm_errno = slurm_get_errno(); if (_slurm_errno == ESLURM_INVALID_JOB_ID) { self->on_missing(self); } else { fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"slurm_load_jobs error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id); } } if (job_info) { fsd_log_debug(("state = %d, state_reason = %d", job_info->job_array[0].job_state, job_info->job_array[0].state_reason)); switch(job_info->job_array[0].job_state & JOB_STATE_BASE) { case JOB_PENDING: switch(job_info->job_array[0].state_reason) { #if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0) case WAIT_HELD_USER: /* job is held by user */ fsd_log_debug(("interpreting as DRMAA_PS_USER_ON_HOLD")); self->state = DRMAA_PS_USER_ON_HOLD; break; #endif case WAIT_HELD: /* job is held by administrator */ fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_ON_HOLD")); self->state = DRMAA_PS_SYSTEM_ON_HOLD; break; default: fsd_log_debug(("interpreting as DRMAA_PS_QUEUED_ACTIVE")); self->state = DRMAA_PS_QUEUED_ACTIVE; } break; case JOB_RUNNING: fsd_log_debug(("interpreting as DRMAA_PS_RUNNING")); self->state = DRMAA_PS_RUNNING; break; case JOB_SUSPENDED: if(slurm_self->user_suspended == true) { fsd_log_debug(("interpreting as DRMAA_PS_USER_SUSPENDED")); self->state = DRMAA_PS_USER_SUSPENDED; } else { fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_SUSPENDED")); self->state = DRMAA_PS_SYSTEM_SUSPENDED; } break; case JOB_COMPLETE: fsd_log_debug(("interpreting as DRMAA_PS_DONE")); self->state = DRMAA_PS_DONE; self->exit_status = job_info->job_array[0].exit_code; fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status))); break; case JOB_CANCELLED: 
fsd_log_debug(("interpreting as DRMAA_PS_FAILED (aborted)")); self->state = DRMAA_PS_FAILED; self->exit_status = -1; case JOB_FAILED: case JOB_TIMEOUT: case JOB_NODE_FAIL: #if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,3,0) case JOB_PREEMPTED: #endif fsd_log_debug(("interpreting as DRMAA_PS_FAILED")); self->state = DRMAA_PS_FAILED; self->exit_status = job_info->job_array[0].exit_code; fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status))); break; default: /*unknown state */ fsd_log_error(("Unknown job state: %d. Please send bug report: http://apps.man.poznan.pl/trac/slurm-drmaa", job_info->job_array[0].job_state)); } if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_COMPLETING) { fsd_log_debug(("Epilog completing")); } if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_CONFIGURING) { fsd_log_debug(("Nodes booting")); } if (self->exit_status == -1) /* input,output,error path failure etc*/ self->state = DRMAA_PS_FAILED; self->last_update_time = time(NULL); if( self->state >= DRMAA_PS_DONE ) { fsd_log_debug(("exit_status = %d, WEXITSTATUS(exit_status) = %d", self->exit_status, WEXITSTATUS(self->exit_status))); fsd_cond_broadcast( &self->status_cond ); } } } FINALLY { if(job_info != NULL) slurm_free_job_info_msg (job_info); fsd_mutex_unlock( &self->session->drm_connection_mutex ); } END_TRY fsd_log_return(( "" )); }
/*
 * Wait until the job named job_id reaches a state >= DRMAA_PS_DONE and
 * fetch its termination status.
 *
 * self     session the job belongs to
 * job_id   identifier handed to self->get_job()
 * timeout  passed to the timed wait; NULL waits without a time limit
 * status   forwarded to job->get_termination_status()
 * rusage   forwarded to job->get_termination_status()
 * dispose  when true the job is removed from self->jobs and flagged
 *          FSD_JOB_DISPOSED after its status has been fetched
 *
 * Raises FSD_DRMAA_ERRNO_INVALID_JOB when get_job() returns NULL, and
 * FSD_DRMAA_ERRNO_EXIT_TIMEOUT when a timed wait returns unsignalled or when
 * session destruction was requested while waiting.
 */
void
fsd_drmaa_session_wait_for_single_job(
		fsd_drmaa_session_t *self,
		const char *job_id, const struct timespec *timeout,
		int *status, fsd_iter_t **rusage,
		bool dispose
		)
{
	fsd_job_t *volatile job = NULL;
	/* Tracks whether self->mutex is held so FINALLY can undo it; it is
	 * assigned from the results of fsd_mutex_lock()/fsd_mutex_unlock(). */
	volatile bool session_locked = false;

	fsd_log_enter(( "(%s)", job_id ));

	TRY
	 {
		job = self->get_job( self, job_id );
		if( job == NULL )
		 {
			fsd_exc_raise_fmt( FSD_DRMAA_ERRNO_INVALID_JOB, "Job '%s' not found in DRMS queue", job_id );
		 }

		job->update_status( job );

		while( !( self->destroy_requested  ||  job->state >= DRMAA_PS_DONE ) )
		 {
			bool woken = true;

			fsd_log_debug(( "fsd_drmaa_session_wait_for_single_job: " "waiting for %s to terminate", job_id ));

			if( !self->enable_wait_thread )
			 {
				/* No wait thread: use the session's own status-change wait. */
				self->wait_for_job_status_change( self, &job->status_cond, &job->mutex, timeout );
			 }
			else
			 {
				/* The wait thread is expected to signal status_cond. */
				if( timeout == NULL )
				 {
					fsd_cond_wait( &job->status_cond, &job->mutex );
				 }
				else
				 {
					woken = fsd_cond_timedwait( &job->status_cond, &job->mutex, timeout );
				 }

				if( !woken )
					fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );
			 }

			fsd_log_debug(( "fsd_drmaa_session_wait_for_single_job: woken up" ));

			/* Without a wait thread nobody else refreshes the job state. */
			if( !self->enable_wait_thread )
				job->update_status( job );
		 }

		if( self->destroy_requested )
			fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );

		job->get_termination_status( job, status, rusage );

		if( dispose )
		 {
			/* Release the job first so the locking order stays
			 * "job set mutex, then job mutex". */
			job->release( job );

			session_locked = fsd_mutex_lock( &self->mutex );

			job = self->get_job( self, job_id );
			if( job == NULL )
			 {
				fsd_log_error(( "Some other thread has already reaped job %s", job_id ));
			 }
			else
			 {
				self->jobs->remove( self->jobs, job );
				job->flags |= FSD_JOB_DISPOSED;
			 }

			session_locked = fsd_mutex_unlock( &self->mutex );
		 }
	 }
	FINALLY
	 {
		if( job )
			job->release( job );

		if( session_locked )
			fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY

	fsd_log_return(( "" ));
}
fsd_drmaa_session_t * fsd_drmaa_session_new( const char *contact ) { fsd_drmaa_session_t *volatile self = NULL; fsd_log_enter(( "(%s)", contact )); TRY { fsd_malloc( self, fsd_drmaa_session_t ); self->release = fsd_drmaa_session_release; self->destroy = fsd_drmaa_session_destroy; self->destroy_nowait = fsd_drmaa_session_destroy_nowait; self->run_job = fsd_drmaa_session_run_job; self->run_bulk = fsd_drmaa_session_run_bulk; self->control_job = fsd_drmaa_session_control_job; self->job_ps = fsd_drmaa_session_job_ps; self->synchronize = fsd_drmaa_session_synchronize; self->wait = fsd_drmaa_session_wait; self->new_job = fsd_drmaa_session_new_job; self->run_impl = fsd_drmaa_session_run_impl; self->wait_for_single_job = fsd_drmaa_session_wait_for_single_job; self->wait_for_any_job = fsd_drmaa_session_wait_for_any_job; self->wait_for_job_status_change = fsd_drmaa_session_wait_for_job_status_change; self->wait_thread = fsd_drmaa_session_wait_thread; self->stop_wait_thread = fsd_drmaa_session_stop_wait_thread; self->update_all_jobs_status = fsd_drmaa_session_update_all_jobs_status; self->get_submited_job_ids = fsd_drmaa_session_get_submited_job_ids; self->get_job = fsd_drmaa_session_get_job; self->load_configuration = fsd_drmaa_session_load_configuration; self->read_configuration = fsd_drmaa_session_read_configuration; self->apply_configuration = fsd_drmaa_session_apply_configuration; self->ref_cnt = 1; self->destroy_requested = false; self->contact = NULL; self->jobs = NULL; self->configuration = NULL; self->pool_delay.tv_sec = 5; self->pool_delay.tv_nsec = 0; self->cache_job_state = 0; self->enable_wait_thread = true; self->job_categories = NULL; self->missing_jobs = FSD_REVEAL_MISSING_JOBS; self->wait_thread_started = false; self->wait_thread_run_flag = false; fsd_mutex_init( &self->mutex ); fsd_cond_init( &self->wait_condition ); fsd_cond_init( &self->destroy_condition ); fsd_mutex_init( &self->drm_connection_mutex ); self->jobs = fsd_job_set_new(); self->contact = 
fsd_strdup( contact ); } EXCEPT_DEFAULT { if( self != NULL ) self->destroy( self ); fsd_exc_reraise(); } END_TRY fsd_log_debug(("sizeof(fsd_drmaa_session_t)=%d", sizeof(fsd_drmaa_session_t))); return self; }
/*
 * Wait until any job of the session has terminated and report on it.
 *
 * self     session whose job set (self->jobs) is searched
 * timeout  passed to the timed wait; NULL waits without a time limit
 * status   forwarded to job->get_termination_status()
 * rusage   forwarded to job->get_termination_status()
 * dispose  when true, and no exception is pending, the job is removed from
 *          the job set and flagged FSD_JOB_DISPOSED
 *
 * Returns a copy of the terminated job's id made with fsd_strdup().
 *
 * Raises FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION when session destruction was
 * requested, FSD_DRMAA_ERRNO_INVALID_JOB when the job set is empty, and
 * FSD_DRMAA_ERRNO_EXIT_TIMEOUT when a timed wait returns unsignalled.  On
 * any exception the id copy, if already made, is freed before re-raising.
 */
char *
fsd_drmaa_session_wait_for_any_job(
		fsd_drmaa_session_t *self,
		const struct timespec *timeout,
		int *status, fsd_iter_t **rusage,
		bool dispose
		)
{
	fsd_job_set_t *jobs = self->jobs;
	fsd_job_t *volatile job = NULL;
	char *volatile job_id = NULL;
	/* Tracks whether self->mutex is held so FINALLY can undo it; it is
	 * assigned from the results of fsd_mutex_lock()/fsd_mutex_unlock(). */
	volatile bool session_locked = false;

	fsd_log_enter(( "" ));

	TRY
	 {
		while( job == NULL )
		 {
			bool woken = true;

			if( self->destroy_requested )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );

			/* Without a wait thread the states must be refreshed here. */
			if( !self->enable_wait_thread )
				self->update_all_jobs_status( self );

			session_locked = fsd_mutex_lock( &self->mutex );

			if( jobs->empty( jobs ) )
			 {
				fsd_exc_raise_msg( FSD_DRMAA_ERRNO_INVALID_JOB, "No job found to be waited for" );
			 }

			/* Leave the loop with self->mutex still held; FINALLY
			 * releases it. */
			job = jobs->find_terminated( jobs );
			if( job != NULL )
				break;

			if( self->destroy_requested )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );

			if( !self->enable_wait_thread )
			 {
				fsd_log_debug(( "wait_for_any_job: waiting for next check" ));
				self->wait_for_job_status_change( self, &self->wait_condition, &self->mutex, timeout );
			 }
			else
			 {
				fsd_log_debug(( "wait_for_any_job: waiting for wait thread" ));
				if( timeout == NULL )
					fsd_cond_wait( &self->wait_condition, &self->mutex );
				else
					woken = fsd_cond_timedwait( &self->wait_condition, &self->mutex, timeout );
			 }

			session_locked = fsd_mutex_unlock( &self->mutex );

			fsd_log_debug(( "wait_for_any_job: woken up; signaled=%d", woken ));
			if( !woken )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );
		 }

		fsd_log_debug(( "wait_for_any_job: waiting finished" ));

		job_id = fsd_strdup( job->job_id );
		job->get_termination_status( job, status, rusage );
	 }
	EXCEPT_DEFAULT
	 {
		if( job_id != NULL )
			fsd_free( job_id );
		fsd_exc_reraise();
	 }
	FINALLY
	 {
		if( job != NULL )
		 {
			/* Reap the job only when nothing went wrong. */
			if( fsd_exc_get() == NULL  &&  dispose )
			 {
				jobs->remove( jobs, job );
				job->flags |= FSD_JOB_DISPOSED;
			 }
			job->release( job );
		 }

		if( session_locked )
			fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY

	fsd_log_return(( " =%s", job_id ));
	return job_id;
}