예제 #1
0
/*
 * Asks the session's wait thread to stop and joins it.
 *
 * The session mutex is taken on entry.  When the wait thread is marked
 * as started, its run flag is cleared, waiters on wait_condition are
 * woken, the mutex is released for the duration of the join and then
 * re-acquired the same number of times before wait_thread_started is
 * cleared.
 *
 * NOTE(review): the tail of this function (the end of the outer TRY and
 * the final unlock) is not visible in this excerpt.
 */
void
fsd_drmaa_session_stop_wait_thread( fsd_drmaa_session_t *self )
{
	/* volatile: written inside a TRY block and read in its FINALLY handler */
	volatile int lock_count = 0;
	fsd_log_enter(( "" ));
	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		fsd_log_debug(("started = %d  run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));
		if( self->wait_thread_started )
		 {
			/* tell the thread's loop to finish and wake it up */
			self->wait_thread_run_flag = false;
			fsd_log_debug(("started = %d  run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));
			fsd_cond_broadcast( &self->wait_condition );
			TRY
			 {
				/* presumably releases every recursive hold on the mutex and
				 * returns how many there were, so that the wait thread can
				 * take the mutex and exit -- TODO confirm against
				 * fsd_mutex_unlock_times() */
				lock_count = fsd_mutex_unlock_times( &self->mutex );
				fsd_thread_join( self->wait_thread_handle, NULL );
			 }
			FINALLY
			 {
				/* take the mutex back as many times as it was released above */
				int i;
				for( i = 0;  i < lock_count;  i++ )
					fsd_mutex_lock( &self->mutex );
			 }
			END_TRY
			self->wait_thread_started = false;
		 }

	 }
/*
 * Logs the current call stack at debug level, outermost frame first
 * ("most recent call last").
 *
 * skip  - number of innermost caller frames to leave out; the frame of
 *         fsd_log_stacktrace() itself is always left out in addition.
 *         Negative values are treated as 0.
 * limit - maximum number of frames to capture beyond the skipped ones;
 *         0 (or a negative value) selects the default of 128.
 *
 * Nothing is logged when memory for the frame buffer or for the symbol
 * table cannot be obtained.
 */
void
fsd_log_stacktrace( int skip, int limit )
{
	void **ptr_buf = NULL;
	const char **symbols = NULL;
	int i, n;

	if( limit <= 0 )
		limit = 128;
	if( skip < 0 )
		skip = 0;
	skip++; /* without fsd_log_stacktrace() frame */
	n = skip + limit;

	ptr_buf = (void**)calloc( (size_t)n, sizeof(void*) );
	if( ptr_buf == NULL )
		return;
	n = backtrace( ptr_buf, n );
	symbols = (const char**)backtrace_symbols( ptr_buf, n );
	if( symbols != NULL )
	 {
		fsd_log_debug(( "Stacktrace (most recent call last):" ));
		/*
		 * backtrace() stores the innermost frame at index 0, so the
		 * frames to omit are 0..skip-1; walk from the outermost frame
		 * down to the first one that is not skipped.
		 */
		for( i = n-1;  i >= skip;  i-- )
			fsd_log_debug(( "\n  %s", symbols[i] ));
		/* backtrace_symbols() returns a single malloc()ed block */
		free( symbols );
	 }
	free( ptr_buf );
}
예제 #3
0
/*
 * Refreshes the state of a job from LSF.
 *
 * With the DRM connection mutex held, the job is looked up with
 * lsb_openjobinfo().  When a record is found it is read with
 * lsb_readjobinfo() and handed to the job's read_job_info() method.
 * When no record is found, a job that does not belong to the current
 * session raises FSD_DRMAA_ERRNO_INVALID_JOB, otherwise the job's
 * on_missing() handler is invoked and the lookup is repeated.
 *
 * lsb_closejobinfo() is called and the mutex is released on every path,
 * including when an exception is raised.
 *
 * NOTE(review): the retry loop has no delay of its own; it relies on
 * on_missing() either raising or changing what the next lookup
 * returns -- confirm against the on_missing implementations.
 */
static void
lsfdrmaa_job_update_status( fsd_job_t *self )
{
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	struct jobInfoEnt *volatile job_info = NULL;
	bool job_in_queue = false;

	/* the time difference is not an int on LP64 targets: print it as long */
	fsd_log_enter(( "({job_id=%s, time_delta=%ld})", self->job_id,
				(long)( time(NULL) - self->submit_time ) ));
	do {
		fsd_mutex_lock( &self->session->drm_connection_mutex );
		TRY
		 {
			int n_records;
			int more;
			/* query across all users when a concrete job id is known */
			char * username = (lsf_self->int_job_id>0)?"all":NULL;

			fsd_log_debug(( "drm connection locked" ));

			n_records = lsb_openjobinfo( lsf_self->int_job_id,
						NULL, username, NULL, NULL, ALL_JOB );
			fsd_log_debug((
						"lsb_openjobinfo( %d[%d], NULL, %s, NULL, NULL, ALL_JOB ) =%d",
						LSB_ARRAY_JOBID(lsf_self->int_job_id),
						LSB_ARRAY_IDX(lsf_self->int_job_id),
						username?username:"******",
						n_records ));

			job_in_queue = n_records > 0;

			if( !job_in_queue )
			 {
				if( !(self->flags & FSD_JOB_CURRENT_SESSION) )
					fsd_exc_raise_code( FSD_DRMAA_ERRNO_INVALID_JOB );
				else /* handling missing job */
					self->on_missing( self );
			 }
			else
			 {
				job_info = lsb_readjobinfo( &more );
				fsd_log_debug(( "lsb_readjobinfo(...) =%p: more=%d",
							(void*)job_info, more ));
				if( job_info == NULL )
					fsd_exc_raise_lsf( "lsb_readjobinfo" );
				lsf_self->read_job_info( self, job_info );
			 }
		 }
		FINALLY
		 {
			/* lsfdrmaa_free_job_info( job_info ); */
			lsb_closejobinfo();
			fsd_log_debug(( "lsb_closejobinfo()" ));
			fsd_mutex_unlock( &self->session->drm_connection_mutex );
		 }
		END_TRY

	} while( !job_in_queue );
	fsd_log_return(( "" ));
}
예제 #4
0
/*
 * Blocks on wait_condition (with mutex) until it is signalled or until
 * the earlier of two moments passes: the caller's timeout, if any, or
 * "now + self->pool_delay".
 *
 * Raises FSD_DRMAA_ERRNO_EXIT_TIMEOUT when the wait ended without a
 * signal and the moment waited for was the caller's timeout.
 */
void
fsd_drmaa_session_wait_for_job_status_change(
		fsd_drmaa_session_t *self,
		fsd_cond_t *wait_condition,
		fsd_mutex_t *mutex,
		const struct timespec *timeout
		)
{
	struct timespec poll_deadline;
	struct timespec *wake_at = &poll_deadline;
	bool signalled;

	if( timeout == NULL )
	 {
		fsd_log_enter(( "(timeout=(null))" ));
	 }
	else
	 {
		fsd_log_enter((
					"(timeout=%ld.%09ld)",
					timeout->tv_sec, timeout->tv_nsec ));
	 }

	/* default wake-up: one polling period from now */
	fsd_get_time( &poll_deadline );
	fsd_ts_add( &poll_deadline, &self->pool_delay );

	/* the caller's timeout wins when it comes first */
	if( timeout != NULL  &&  fsd_ts_cmp( timeout, &poll_deadline ) < 0 )
		wake_at = (struct timespec*)timeout;

	fsd_log_debug(( "wait_for_job_status_change: waiting untill %ld.%09ld",
				wake_at->tv_sec, wake_at->tv_nsec ));

	signalled = fsd_cond_timedwait( wait_condition, mutex, wake_at );

	if( wake_at == timeout  &&  !signalled )
		fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );

	fsd_log_return(( ": next_check=%ld.%09ld, status_changed=%d",
				wake_at->tv_sec, wake_at->tv_nsec,
				(int)signalled
				));
}
예제 #5
0
파일: drmaa.c 프로젝트: RichardChen3511/oar
/*
 * Decides from a job's exit status whether the job counts as aborted
 * and stores the answer in *aborted.
 *
 *   -1, 126, 127            -> aborted
 *   any other value <= 125  -> not aborted
 *   anything else           -> aborted only when the low seven bits
 *                              equal SIGTERM or SIGKILL
 *
 * error_diagnosis and error_diag_len are not used.
 * Always returns DRMAA_ERRNO_SUCCESS.
 */
static int
oardrmaa_wifaborted(
		int *aborted, int stat,
		char *error_diagnosis, size_t error_diag_len
		)
{
	int low_bits;

	fsd_log_info(("wifaborted(%d)>>>>", stat));
	fsd_log_debug(("wifaborted(%d)", stat));

	if( stat == -1  ||  stat == 126  ||  stat == 127 )
	 {
		*aborted = true;
		return DRMAA_ERRNO_SUCCESS;
	 }

	if( stat <= 125 )
	 {
		*aborted = false;
		return DRMAA_ERRNO_SUCCESS;
	 }

	low_bits = stat & 0x7f;
	if( low_bits == SIGTERM  ||  low_bits == SIGKILL )
		*aborted = true;
	else
		*aborted = false;

	return DRMAA_ERRNO_SUCCESS;
}
예제 #6
0
/*
 * Applies a DRMAA control action to an LSF job by sending it a signal:
 *
 *   SUSPEND, HOLD    -> SIGSTOP
 *   RESUME, RELEASE  -> SIGCONT
 *   TERMINATE        -> SIGKILL
 *
 * Any other action raises FSD_ERRNO_INVALID_ARGUMENT.  The signal is
 * sent with lsb_signaljob() while the DRM connection mutex is held; a
 * negative result raises FSD_ERRNO_INTERNAL_ERROR.
 *
 * The function does not wait for the job state to change, since
 * drmaa_control is not required to return only after the change
 * completes.
 */
static void
lsfdrmaa_job_control( fsd_job_t *self, int action )
{
	lsfdrmaa_job_t *lsf_self = (lsfdrmaa_job_t*)self;
	int signo;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));

	switch( action )
	 {
		case DRMAA_CONTROL_TERMINATE:
			/* TODO: sending SIGTERM (configurable)? */
			signo = SIGKILL;
			break;
		case DRMAA_CONTROL_RESUME:
		case DRMAA_CONTROL_RELEASE:
			signo = SIGCONT;
			break;
		case DRMAA_CONTROL_SUSPEND:
		case DRMAA_CONTROL_HOLD:
			signo = SIGSTOP;
			break;
		default:
			fsd_exc_raise_fmt(
					FSD_ERRNO_INVALID_ARGUMENT,
					"job::control: unknown action %d", action );
	 }

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	 {
		int result = lsb_signaljob( lsf_self->int_job_id, signo );

		fsd_log_debug(( "lsb_signaljob( %d[%d], %d ) = %d",
					LSB_ARRAY_JOBID(lsf_self->int_job_id),
					LSB_ARRAY_IDX(lsf_self->int_job_id),
					signo, result ));
		if( result < 0 )
		 {
			fsd_exc_raise_fmt(
					FSD_ERRNO_INTERNAL_ERROR,
					"job::control: could not send %s to job %s",
					fsd_strsignal( signo ), self->job_id
					);
		 }
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	 }
	END_TRY

	fsd_log_return(( "" ));
}
예제 #7
0
/*
 * Tears a session down and waits for its other users to let go of it.
 *
 * Steps, in order:
 *   1. under the session mutex, mark the session as destroy_requested
 *      and wake everything waiting on wait_condition;
 *   2. if a destroy had already been requested, release this reference
 *      and raise FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION;
 *   3. signal all jobs of the session;
 *   4. under the session mutex, wait on destroy_condition until ref_cnt
 *      drops to 1, then stop the wait thread if it is running;
 *   5. call destroy_nowait().
 */
void
fsd_drmaa_session_destroy( fsd_drmaa_session_t *self )
{
	bool destroy_in_progress = false;

	fsd_log_enter(( "" ));

	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		if( !self->destroy_requested )
		 {
			self->destroy_requested = true;
			fsd_cond_broadcast( &self->wait_condition );
		 }
		else
		 {
			destroy_in_progress = true;
		 }
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY

	if( destroy_in_progress )
	 {
		/* XXX: actually it can not happen in current implementation
		 * when using DRMAA API */
		self->release( self );
		fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );
	 }

	self->jobs->signal_all( self->jobs );

	fsd_mutex_lock( &self->mutex );
	TRY
	 {
		/* wait until this caller holds the only remaining reference */
		while( self->ref_cnt > 1 )
		 {
			fsd_cond_wait( &self->destroy_condition, &self->mutex );
		 }
		fsd_log_debug(("started = %d  run_flag = %d", self->wait_thread_started, self->wait_thread_run_flag ));
		if( self->wait_thread_started )
		 {
			self->stop_wait_thread( self );
		 }
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY

	self->destroy_nowait( self );
	fsd_log_return(( "" ));
}
예제 #8
0
/*
 * Body of the session's wait thread.
 *
 * While self->wait_thread_run_flag is set, each iteration updates the
 * status of all jobs, wakes waiters on wait_condition and then sleeps on
 * the same condition until "now + self->pool_delay" passes or the
 * condition is signalled.  An exception raised inside an iteration is
 * logged as an error and the loop carries on.
 *
 * The session mutex is taken before the loop and released afterwards,
 * provided fsd_mutex_lock() reported that it was acquired.
 *
 * Always returns NULL.
 */
void *
fsd_drmaa_session_wait_thread( fsd_drmaa_session_t *self )
{
	struct timespec wakeup;
	bool volatile holds_mutex = false;

	fsd_log_enter(( "" ));
	holds_mutex = fsd_mutex_lock( &self->mutex );
	TRY
	 {
		while( self->wait_thread_run_flag )
		 {
			TRY
			 {
				fsd_log_debug(( "wait thread: next iteration" ));
				self->update_all_jobs_status( self );
				fsd_cond_broadcast( &self->wait_condition );

				/* sleep for one polling period unless woken earlier */
				fsd_get_time( &wakeup );
				fsd_ts_add( &wakeup, &self->pool_delay );
				fsd_cond_timedwait( &self->wait_condition, &self->mutex, &wakeup );
			 }
			EXCEPT_DEFAULT
			 {
				const fsd_exc_t *exc = fsd_exc_get();
				fsd_log_error(( "wait thread: <%d:%s>", exc->code(exc), exc->message(exc) ));
			 }
			END_TRY
		 }
	 }
	FINALLY
	 {
		if( holds_mutex )
		 {
			fsd_mutex_unlock( &self->mutex );
		 }
	 }
	END_TRY

	fsd_log_return(( " =NULL" ));
	return NULL;
}
예제 #9
0
/*
 * Submits a job template to Slurm as one batch job -- an array job when
 * more than one task is requested -- and registers one fsd job object
 * per task in the session's job set.
 *
 * start, end, incr describe the task indices: start, start+incr, ...
 *
 * NOTE(review): the remainder of this function (exception handling,
 * cleanup and the return statement) is not visible in this excerpt.
 */
fsd_iter_t *
slurmdrmaa_session_run_bulk(
		fsd_drmaa_session_t *self,
		const fsd_template_t *jt,
		int start, int end, int incr )
{
	int ret = 0;
	unsigned i = 0;
	int job_id = 0;
	int task_id = 0;
	fsd_job_t *volatile job = NULL;
	/* NOTE(review): incr == 0 would divide by zero here; presumably the
	 * caller validates it -- confirm */
	volatile unsigned n_jobs = (end - start) / incr + 1;
	/* one extra slot beyond the n_jobs ids */
	char ** volatile job_ids = fsd_calloc( job_ids, n_jobs + 1, char* );
	/* tracks the result of the last lock/unlock call on drm_connection_mutex */
	volatile bool connection_lock = false;
	fsd_environ_t *volatile env = NULL;
	job_desc_msg_t job_desc;
	submit_response_msg_t *submit_response = NULL;

	slurmdrmaa_init_job_desc( &job_desc );

	TRY
	{
			connection_lock = fsd_mutex_lock( &self->drm_connection_mutex );
			slurmdrmaa_job_create_req( self, jt, (fsd_environ_t**)&env , &job_desc, 0 );

			/* Create job array spec if more than 1 task */
			if(n_jobs > 1)
			{
				/* NOTE(review): the element type passed is char* although
				 * array_inx is used as a character buffer below; this looks
				 * like an over-allocation rather than an overflow -- confirm
				 * against the fsd_calloc macro */
				fsd_calloc(job_desc.array_inx, ARRAY_INX_MAXLEN, char*);
				ret = snprintf(job_desc.array_inx, ARRAY_INX_MAXLEN, "%d-%d:%d", start, end, incr );
				if (ret < 0 || ret >= ARRAY_INX_MAXLEN) {
					fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "snprintf: not enough memory");
				}
				fsd_log_debug(("array job '%s' prepared", job_desc.array_inx));
			}

			/* Submit the batch job */
			if(slurm_submit_batch_job(&job_desc, &submit_response) != SLURM_SUCCESS){
				fsd_exc_raise_fmt(
					FSD_ERRNO_INTERNAL_ERROR,"slurm_submit_batch_job: %s",slurm_strerror(slurm_get_errno()));
			}

			connection_lock = fsd_mutex_unlock( &self->drm_connection_mutex );

			/* Watch each job in the array */
			for (i = 0; i < n_jobs; ++i) {
				job_id = (int) submit_response->job_id;
				task_id = start + i*incr;
				/* id format: "<job>[_<task>][.<cluster>]"; the cluster suffix
				 * is appended only when working_cluster_rec is set */
				if (n_jobs > 1) {
					/* Array job */
					if (!working_cluster_rec)
						job_ids[i] = fsd_asprintf("%d_%d", job_id, task_id); /* .0*/
					else
						job_ids[i] = fsd_asprintf("%d_%d.%s", job_id, task_id, working_cluster_rec->name);
				} else {
					/* Single job */
					if (!working_cluster_rec)
						job_ids[i] = fsd_asprintf("%d", job_id); /* .0*/
					else
						job_ids[i] = fsd_asprintf("%d.%s", job_id, working_cluster_rec->name);
				}

				fsd_log_debug(("job %s submitted", job_ids[i]));
				/* the job object gets its own copy of the id string */
				job = slurmdrmaa_job_new( fsd_strdup(job_ids[i]) );
				job->session = self;
				job->submit_time = time(NULL);
				self->jobs->add( self->jobs, job );
				/* drop the local reference once the job is in the set */
				job->release( job );
				job = NULL;
			}

			if (working_cluster_rec)
				slurmdb_destroy_cluster_rec(working_cluster_rec);

			working_cluster_rec = NULL;
	 }
예제 #10
0
/*
 * Fills an LSF submit request from a DRMAA job template.
 *
 * session - session whose job_categories dictionary is consulted and
 *           whose prepand_report_to_output setting (read through a cast
 *           to lsfdrmaa_session_t) influences e-mail handling.
 * expand  - placeholder expander; used for the working directory, the
 *           command and its arguments, and (through internal_map_file)
 *           the stream paths.  Its FSD_DRMAA_PH_WD value is set here
 *           when the template has a working directory.
 * jt      - job template to read attributes from.
 * req     - request to fill.  options, options2, the resource limits,
 *           beginTime and termTime are reset first; other fields are
 *           only written when the matching attribute is present.
 * envp    - receives a newly created environment object when the
 *           template carries DRMAA_V_ENV; left untouched otherwise.
 *
 * Raises (via the fsd exception macros):
 *   FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES - no remote command, or
 *       join_files requested with an error path but no output path;
 *   FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE - unknown job category or a
 *       malformed js_state / join_files / transfer_files / block_email;
 *   FSD_ERRNO_INTERNAL_ERROR - misconfigured job category, or DRMAA_WD
 *       used with an LSF version that lacks SUB3_CWD.
 */
void
lsfdrmaa_job_set_req(
		fsd_drmaa_session_t *session,
		fsd_expand_drmaa_ph_t *expand,
		const fsd_template_t *jt,
		struct submit *req,
		fsd_environ_t **envp
		)
{
	const char *input_path_orig = NULL;
	const char *output_path_orig = NULL;
	const char *error_path_orig = NULL;
	/* volatile: assigned inside TRY blocks and released in FINALLY */
	char *volatile input_path = NULL;
	char *volatile output_path = NULL;
	char *volatile error_path = NULL;
	bool input_host = false;
	bool output_host = false;
	bool error_host = false;
	bool join_files = false;
	bool transfer_input = false;
	bool transfer_output = false;
	bool transfer_error = false;
	const char *job_category = "default";
	char **volatile argv = NULL;

	const char *value;
	const char *const *vector;

	/* set default lsf configs */
	 {
		int i = 0;
		req->options = 0;
		req->options2 = 0;
		for( i = 0;  i < LSF_RLIM_NLIMITS;  i++ )
			req->rLimits[i] = DEFAULT_RLIMIT;
		req->beginTime = 0;
		req->termTime = 0;
	 }

	/* job category */
	value = jt->get_attr( jt, DRMAA_JOB_CATEGORY );
	if( value )
		job_category = value;

	 {
		fsd_conf_option_t *category_value = NULL;
		category_value = fsd_conf_dict_get( session->job_categories, job_category );
		if( category_value != NULL )
		 {
			if( category_value->type != FSD_CONF_STRING )
				fsd_exc_raise_fmt(
						FSD_ERRNO_INTERNAL_ERROR,
						"configuration error: job category should be string"
						);
			lsfdrmaa_native_parse( category_value->val.string, req );
		 }
		else
		 {
			/* a missing "default" category is fine; a missing category
			 * that the template asked for by name is an error */
			if( value != NULL )
				fsd_exc_raise_fmt(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid job category: %s", job_category
						);
		 }
	 }

	/* job working directory */
	value = jt->get_attr( jt, DRMAA_WD );
	if( value )
	 {
		char *cwd = NULL;
		cwd = expand->expand( expand, fsd_strdup(value),
				FSD_DRMAA_PH_HD | FSD_DRMAA_PH_INCR );
		expand->set( expand, FSD_DRMAA_PH_WD, cwd );
#ifdef SUB3_CWD
		req->cwd = fsd_strdup( cwd );
		req->options3 |= SUB3_CWD;
#else
		fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "DRMAA_WD attribute is not supported in this version of LSF.");
#endif
	 }

	TRY
	 {
		const char *command = NULL;
		unsigned n_args = 0;
		const char *const *i;
		int j;

		/* remote command */
		command = jt->get_attr( jt, DRMAA_REMOTE_COMMAND );
		if( command == NULL )
			fsd_exc_raise_msg(
					FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,
					"drmaa_remote_command not set for job template"
					);

		/* arguments list */
		vector = jt->get_v_attr( jt, DRMAA_V_ARGV );
		if( vector )
		 {
			for( i = vector;  *i;  i++ )
				n_args++;
		 }
		/* slots: "exec", the command, the arguments, terminating NULL */
		fsd_calloc( argv, n_args+3, char* );
		argv[0] = fsd_strdup("exec");
		argv[1] = expand->expand( expand, fsd_strdup(command),
				FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );
		if( vector )
		 {
			for( i = vector, j = 2;  *i;  i++, j++ )
				argv[j] = expand->expand( expand, fsd_strdup(*i),
					FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );
		 }

		req->command = lsfdrmaa_job_quote_command( (const char*const*)argv );
	 }
	FINALLY
	 {
		fsd_free_vector( argv );
	 }
	END_TRY

	/* job name */
	value = jt->get_attr( jt, DRMAA_JOB_NAME );
	if( value )
	 {
		req->jobName = fsd_strdup(value);
		req->options |= SUB_JOB_NAME;
	 }

	/* job state at submit */
	value = jt->get_attr( jt, DRMAA_JS_STATE );
	if( value )
	 {
		if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_ACTIVE ) )
			/* bitwise complement: clear only the hold bit and keep
			 * every other options2 flag */
			req->options2 &= ~SUB2_HOLD;
		else if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_HOLD ) )
			req->options2 |= SUB2_HOLD;
		else
			fsd_exc_raise_msg(
					FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
					"invalid value of drmaa_js_state attribute" );
	 }

	/* environment */
	vector = jt->get_v_attr( jt, DRMAA_V_ENV );
	if( vector )
	 {
		fsd_environ_t *env;
		*envp = env = fsd_environ_new( NULL );
		env->update( env, vector );
	 }

	/* start time */
	value = jt->get_attr( jt, DRMAA_START_TIME );
	if( value )
	 {
		req->beginTime = fsd_datetime_parse( value );
		fsd_log_debug(( "\n  drmaa_start_time: %s -> %ld",
					value, (long)req->beginTime ));
	 }

	TRY
	 {
		/* input path */
		input_path_orig = jt->get_attr( jt, DRMAA_INPUT_PATH );
		if( input_path_orig )
		 {
			input_path = internal_map_file( expand, input_path_orig, &input_host,
							"input" );
			fsd_log_debug(( "\n  drmaa_input_path: %s -> %s",
						input_path_orig, input_path ));
		 }

		/* output path */
		output_path_orig = jt->get_attr( jt, DRMAA_OUTPUT_PATH );
		if( output_path_orig )
		 {
			output_path = internal_map_file( expand, output_path_orig, &output_host,
							"output" );
			fsd_log_debug(( "\n  drmaa_output_path: %s -> %s",
						output_path_orig, output_path ));
		 }

		/* error path */
		error_path_orig = jt->get_attr( jt, DRMAA_ERROR_PATH );
		if( error_path_orig )
		 {
			error_path = internal_map_file( expand, error_path_orig, &error_host,
							"error" );
			fsd_log_debug(( "\n  drmaa_error_path: %s -> %s",
						error_path_orig, error_path ));
		 }

		/* join files */
		value = jt->get_attr( jt, DRMAA_JOIN_FILES );
		if( value )
		 {
			if( (value[0] == 'y' || value[0] == 'Y')  &&  value[1] == '\0' )
				join_files = true;
			else if( (value[0] == 'n' || value[0] == 'N')  &&  value[1] == '\0' )
				join_files = false;
			else
				fsd_exc_raise_msg(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid value of drmaa_join_files attribute" );
		 }

		if( join_files )
		 {
			/*
			 * LSF by default joins output and error streams
			 * when error file is not set.
			 */
			if( error_path )
			 {
				if( output_path == NULL )
					fsd_exc_raise_msg(
							FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,
							"drmaa_join_files is set and output file is not given" );
				if( 0 != strcmp( output_path, error_path ) )
					fsd_log_warning(( "Error file was given but will be ignored "
								"since drmaa_join_files was set." ));
				fsd_free( error_path );  error_path = NULL;
			 }
		 }
		else
		 {
			/*
			 * If error path is not set, we must set it to /dev/null
			 * to prevent joining files.
			 */
			if( error_path == NULL  &&  output_path )
				error_path = fsd_strdup( "/dev/null" );
			if( output_path == NULL  &&  error_path )
				output_path = fsd_strdup( "/dev/null" );
			/*
			 * NOTE(review): when error_path is set, req->errFile is
			 * overwritten further down without releasing the string
			 * stored here -- looks like a small leak; confirm who owns
			 * req->errFile before changing it.
			 */
			if( req->errFile == NULL )
			 {
				req->errFile = fsd_strdup( "/dev/null" );
				req->options |= SUB_ERR_FILE;
	#ifdef SUB2_OVERWRITE_ERR_FILE
				req->options2 &= ~SUB2_OVERWRITE_ERR_FILE;
	#endif
			 }
		 }

		/* transfer files */
		value = jt->get_attr( jt, DRMAA_TRANSFER_FILES );
		if( value )
		 {
			const char *i;
			for( i = value;  *i;  i++ )
			 {
				switch( *i )
				 {
					case 'i':  transfer_input = true;  break;
					case 'o':  transfer_output = true;  break;
					case 'e':  transfer_error = true;  break;
					default:
						fsd_exc_raise_fmt(
								FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
								"invalid character '%c' in drmaa_transfer_files: %s",
								*i, value
								);
				 }
			 }
		 }

#	if 0
		 {
			/*
			 * Input file is transfered by LSF from submission host whenever
			 * it isn't found on execution host regardless of explicit file transfers.
			 * When drmaa_transfer_files contains ``i`` input file is send
			 * explicitly because it may be outdated or otherwise differ.
			 */
			static const char *name[3]
				= {"input", "output", "error"};
			static const int options[3]
				= { XF_OP_SUB2EXEC, XF_OP_EXEC2SUB, XF_OP_EXEC2SUB };
			const char *path[3];
			bool host[3], transfer[3];
			int i;

			path[i=0] = input_path;
			path[++i] = output_path;
			path[++i] = error_path;
			host[i=0] = input_host;
			host[++i] = output_host;
			host[++i] = error_host;
			transfer[i=0] = transfer_input;
			transfer[++i] = transfer_output;
			transfer[++i] = transfer_error;

			for( i = 0;  i < 3;  i++ )
			 {
				struct xFile *t;
				if( !(transfer[i]  &&  path[i] != NULL) )
					continue;
				if( 0 == strcmp( path[i], "/dev/null" ) )
					continue;
				if( host[i] )
					fsd_log_warning((
								"hostname in drmaa_%s_path ignored", name[i] ));
				fsd_log_debug(( "setting transfer of %s file (%s) "
							"to execution host", name[i], path[i] ));
				fsd_realloc( req->xf, req->nxf+1, struct xFile );
				t = &req->xf[ req->nxf++ ];
				memset( t, 0, sizeof(struct xFile) );
				if( sizeof(t->subFn) == MAXFILENAMELEN )
				 { /* LSF 6 */
					strlcpy( t->subFn, path[i], MAXFILENAMELEN );
					strlcpy( t->execFn, path[i], MAXFILENAMELEN );
				 }
				else
				 { /* LSF 7 */
					*(char**)&t->subFn = fsd_strdup( path[i] );
					*(char**)&t->execFn = fsd_strdup( path[i] );
				 }
				t->options = options[i];
			 }

			if( req->nxf > 0 )
				req->options |= SUB_OTHER_FILES;
		 }
#	endif /* transfer files */

		/* email addresses to send notifications */
		vector = jt->get_v_attr( jt, DRMAA_V_EMAIL );
		if( vector  &&  vector[0] )
		 {
			/* only to one email address message may be send */
			req->mailUser = fsd_strdup( vector[0] );
			req->options |= SUB_MAIL_USER | SUB_NOTIFY_END;
	#if 0
			if( vector[1] != NULL )
				fsd_log_warning(( "LSF only supports one e-mail "
							"notification address" ));
	#endif
		 }

		/* block email */
		value = jt->get_attr( jt, DRMAA_BLOCK_EMAIL );
		if( value )
		 {
			bool block;
			if( strcmp(value, "1") == 0 )
				block = true;
			else if( strcmp(value, "0") == 0 )
				block = false;
			else
				fsd_exc_raise_msg(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid value of drmaa_block_email attribute" );
			if( block )
			 {
				if( output_path == NULL )
				 {
					fsd_log_debug(( "output path not set and we want to block e-mail, "
								"set to /dev/null" ));
					output_path = fsd_strdup( "/dev/null" );
				 }
				req->options &= ~SUB_NOTIFY_END;
			 }
			else
			 {
				/* SUB_NOTIFY_END should force sending e-mail even if outfile is set */
				req->options |= SUB_NOTIFY_END;
			 }
		 }

		if( !((lsfdrmaa_session_t*)session)->prepand_report_to_output
				&&  (req->options & SUB_NOTIFY_END) == 0
				&&  output_path != NULL )
		 {
			req->options |= SUB_MAIL_USER | SUB_NOTIFY_END;
			fsd_free( req->mailUser );  /* when email was set
				but notification was blocked */
			req->mailUser = fsd_strdup( "notexistent" );
		 }

		/*
		 * Hand the mapped paths over to the request.  Each local is
		 * reset to NULL afterwards so that the FINALLY block below
		 * does not free a string the request now refers to.
		 */
		if( input_path )
		 {
			req->inFile = input_path;
			req->options |= SUB_IN_FILE;
			input_path = NULL;
		 }

		if( output_path )
		 {
			req->outFile = output_path;
			req->options |= SUB_OUT_FILE;
	#ifdef SUB2_OVERWRITE_OUT_FILE
			if( 0 != strcmp( output_path, "/dev/null" ) )
				req->options2 |= SUB2_OVERWRITE_OUT_FILE;
	#endif
			output_path = NULL;
		 }

		if( error_path )
		 {
			req->errFile = error_path;
			req->options |= SUB_ERR_FILE;
	#ifdef SUB2_OVERWRITE_ERR_FILE
			if( 0 != strcmp( error_path, "/dev/null" ) )
				req->options2 |= SUB2_OVERWRITE_ERR_FILE;
	#endif
			error_path = NULL;
		 }
	 }
	FINALLY
	 {
		fsd_free( input_path );
		fsd_free( output_path );
		fsd_free( error_path );
	 }
	END_TRY


	/* deadline time */
	value = jt->get_attr( jt, DRMAA_DEADLINE_TIME );
	if( value )
		req->termTime = fsd_datetime_parse( value );

	/* wall clock time hard limit */
	value = jt->get_attr( jt, DRMAA_WCT_HLIMIT );
	if( value )
		req->rLimits[ LSF_RLIMIT_RUN ] = fsd_parse_timedelta( value );

	/* wall clock time soft limit */
#ifdef SUB3_RUNTIME_ESTIMATION
	value = jt->get_attr( jt, DRMAA_WCT_SLIMIT );
	if( value )
	 {
		/* OR the flag in: options3 may already carry SUB3_CWD or bits
		 * set while parsing the job category */
		req->options3 |= SUB3_RUNTIME_ESTIMATION;
		req->runtimeEstimation = fsd_parse_timedelta( value );
	 }
#endif

	/* duration hard limit */
	value = jt->get_attr( jt, DRMAA_DURATION_HLIMIT );
	if( value )
		req->rLimits[ LSF_RLIMIT_CPU ] = fsd_parse_timedelta( value );

	/* native specification */
	value = jt->get_attr( jt, DRMAA_NATIVE_SPECIFICATION );
	if( value )
		lsfdrmaa_native_parse( value, req );

	lsfdrmaa_dump_submit_req(req);
}
예제 #11
0
/*
 * Copies the state reported by LSF in job_info into the job object.
 *
 * The record is first dumped to the debug log.  Then:
 *   - the FSD_JOB_* state flags and FSD_JOB_ABORTED in self->flags are
 *     recomputed from the JOB_STAT_* bits;
 *   - self->state is set to the DRMAA state of the first matching
 *     status bit, or DRMAA_PS_FAILED when none matches;
 *   - exit status, start/end time, CPU time, memory figures, wall time
 *     and the execution host list are taken over.  The memory figures
 *     only ever grow, and the host list is stored only once;
 *   - waiters on self->status_cond are woken once the state is
 *     DRMAA_PS_DONE or beyond.
 */
static void
lsfdrmaa_job_read_job_info( fsd_job_t *self, struct jobInfoEnt *job_info )
{
	int status, flags;

	fsd_log_enter(( "" ));
	 {
		int i;
		fsd_log_debug(( "job status of %s updated from %d[%d]",
					self->job_id,
					LSB_ARRAY_JOBID(job_info->jobId),
					LSB_ARRAY_IDX(job_info->jobId) ));
		fsd_log_debug(( "\n  status: 0x%x", job_info->status ));
		/* timestamps are cast so that the arguments always match %ld */
		fsd_log_debug(( "\n  submitTime: %ld", (long)job_info->submitTime ));
		fsd_log_debug(( "\n  startTime: %ld", (long)job_info->startTime ));
		fsd_log_debug(( "\n  endTime: %ld", (long)job_info->endTime ));
		fsd_log_debug(( "\n  duration: %d", job_info->duration ));
		fsd_log_debug(( "\n  cpuTime: %f", job_info->cpuTime ));
		fsd_log_debug(( "\n  cwd: %s", job_info->cwd ));
		fsd_log_debug(( "\n  fromHost: %s", job_info->fromHost ));
		fsd_log_debug(( "\n  numExHosts: %d", job_info->numExHosts ));
		for( i = 0;  i < job_info->numExHosts;  i++ )
			fsd_log_debug(( "\n  exHosts[%d]: %s", i, job_info->exHosts[i] ));
		fsd_log_debug(( "\n  exitStatus: %d", job_info->exitStatus ));
		fsd_log_debug(( "\n  execCwd: %s", job_info->execCwd ));
		fsd_log_debug(( "\n  runRusage.mem: %d", job_info->runRusage.mem ));
		fsd_log_debug(( "\n  runRusage.swap: %d", job_info->runRusage.swap ));
		fsd_log_debug(( "\n  runRusage.utime: %d", job_info->runRusage.utime ));
		fsd_log_debug(( "\n  runRusage.stime: %d", job_info->runRusage.stime ));
		fsd_log_debug(( "\n  jName: %s", job_info->jName ));
		/* fsd_log_debug(( "\n  execRusage: %s", job_info->execRusage )); */
	 }

	status = job_info->status;

	/* rebuild the state flags from scratch */
	flags = 0;
	if( status & (JOB_STAT_PEND | JOB_STAT_PSUSP) )
		flags |= FSD_JOB_QUEUED;
	if( status & JOB_STAT_PSUSP )
		flags |= FSD_JOB_HOLD;
	if( status & (JOB_STAT_RUN | JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_RUNNING;
	if( status & (JOB_STAT_USUSP | JOB_STAT_SSUSP) )
		flags |= FSD_JOB_SUSPENDED;
	if( status & (JOB_STAT_DONE | JOB_STAT_EXIT) )
		flags |= FSD_JOB_TERMINATED;
	if( status & (JOB_STAT_EXIT | JOB_STAT_PERR) )
		flags |= FSD_JOB_ABORTED;
	self->flags &= ~(FSD_JOB_STATE_MASK | FSD_JOB_ABORTED);
	self->flags |= flags;

	/* the first matching bit decides the DRMAA state */
	if( status & (JOB_STAT_WAIT | JOB_STAT_PEND) )
		self->state = DRMAA_PS_QUEUED_ACTIVE;
	else if( status & JOB_STAT_PSUSP )
		self->state = DRMAA_PS_USER_ON_HOLD;
	else if( status & JOB_STAT_RUN )
		self->state = DRMAA_PS_RUNNING;
	else if( status & JOB_STAT_SSUSP )
		self->state = DRMAA_PS_SYSTEM_SUSPENDED;
	else if( status & JOB_STAT_USUSP )
		self->state = DRMAA_PS_USER_SUSPENDED;
	else if( status & JOB_STAT_DONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_EXIT )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_PDONE )
		self->state = DRMAA_PS_DONE;
	else if( status & JOB_STAT_PERR )
		self->state = DRMAA_PS_FAILED;
	else if( status & JOB_STAT_UNKWN )
		self->state = DRMAA_PS_UNDETERMINED;
	else
		self->state = DRMAA_PS_FAILED;

	/* keep everything above the low byte of LSF's exit status ... */
	self->exit_status = job_info->exitStatus & ~0xff;
	/* ... and set the lowest bit when LSF reports JOB_STAT_EXIT but
	 * nothing above the low byte is set */
	if( (self->exit_status >> 8) == 0  &&  (job_info->status & JOB_STAT_EXIT) )
		self->exit_status |= 0x01;
	self->start_time = job_info->startTime;
	self->end_time = job_info->endTime;
	self->cpu_usage = job_info->cpuTime;
	/* memory figures are scaled by 1024 and never decrease */
	self->mem_usage = max( self->mem_usage, 1024*job_info->runRusage.mem );
	self->vmem_usage = max( self->vmem_usage, 1024*job_info->runRusage.swap );
	self->walltime = 60*job_info->duration;
	self->n_execution_hosts = job_info->numExHosts;
	if( self->execution_hosts == NULL  &&  job_info->exHosts != NULL )
		self->execution_hosts
			= fsd_explode( (const char*const*)job_info->exHosts, ' ',
					job_info->numExHosts );
	self->last_update_time = time(NULL);
	if( self->state >= DRMAA_PS_DONE )
		fsd_cond_broadcast( &self->status_cond );
	fsd_log_return(( "" ));
}
예제 #12
0
파일: job.c 프로젝트: eliv/slurm-drmaa-1
/*
 * Applies a DRMAA control action to a Slurm job while holding the DRM
 * connection mutex:
 *
 *   SUSPEND   - slurm_suspend(); marks the job as user-suspended
 *   HOLD      - slurm_update_job() with priority 0
 *   RESUME    - slurm_resume(); clears the user-suspended mark
 *   RELEASE   - slurm_update_job() with priority INFINITE
 *   TERMINATE - slurm_kill_job() with SIGKILL
 *
 * A failing Slurm call raises FSD_ERRNO_INTERNAL_ERROR; an unknown
 * action raises FSD_ERRNO_INVALID_ARGUMENT.
 *
 * NOTE(review): in the HOLD case old_priority is taken from a freshly
 * initialised descriptor, not from the job's current record -- confirm
 * that this is intended.
 */
static void
slurmdrmaa_job_control( fsd_job_t *self, int action )
{
	slurmdrmaa_job_t *sjob = (slurmdrmaa_job_t*)self;
	job_desc_msg_t update_msg;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	 {
		switch( action )
		 {
			case DRMAA_CONTROL_SUSPEND:
				if( slurm_suspend( fsd_atoi(self->job_id) ) == -1 )
				 {
					fsd_exc_raise_fmt(
							FSD_ERRNO_INTERNAL_ERROR,
							"slurm_suspend error: %s,job_id: %s",
							slurm_strerror(slurm_get_errno()),
							self->job_id );
				 }
				sjob->user_suspended = true;
				break;

			case DRMAA_CONTROL_HOLD:
				/* change priority to 0*/
				slurm_init_job_desc_msg( &update_msg );
				sjob->old_priority = update_msg.priority;
				update_msg.job_id = atoi( self->job_id );
				update_msg.priority = 0;
				update_msg.alloc_sid = 0;
				if( slurm_update_job( &update_msg ) == -1 )
				 {
					fsd_exc_raise_fmt(
							FSD_ERRNO_INTERNAL_ERROR,
							"slurm_update_job error: %s,job_id: %s",
							slurm_strerror(slurm_get_errno()),
							self->job_id );
				 }
				break;

			case DRMAA_CONTROL_RESUME:
				if( slurm_resume( fsd_atoi(self->job_id) ) == -1 )
				 {
					fsd_exc_raise_fmt(
							FSD_ERRNO_INTERNAL_ERROR,
							"slurm_resume error: %s,job_id: %s",
							slurm_strerror(slurm_get_errno()),
							self->job_id );
				 }
				sjob->user_suspended = false;
				break;

			case DRMAA_CONTROL_RELEASE:
				/* change priority back*/
				slurm_init_job_desc_msg( &update_msg );
				update_msg.priority = INFINITE;
				update_msg.job_id = atoi( self->job_id );
				if( slurm_update_job( &update_msg ) == -1 )
				 {
					fsd_exc_raise_fmt(
							FSD_ERRNO_INTERNAL_ERROR,
							"slurm_update_job error: %s,job_id: %s",
							slurm_strerror(slurm_get_errno()),
							self->job_id );
				 }
				break;

			case DRMAA_CONTROL_TERMINATE:
				if( slurm_kill_job( fsd_atoi(self->job_id), SIGKILL, 0 ) == -1 )
				 {
					fsd_exc_raise_fmt(
							FSD_ERRNO_INTERNAL_ERROR,
							"slurm_terminate_job error: %s,job_id: %s",
							slurm_strerror(slurm_get_errno()),
							self->job_id );
				 }
				break;

			default:
				fsd_exc_raise_fmt(
						FSD_ERRNO_INVALID_ARGUMENT,
						"job::control: unknown action %d", action );
		 }

		fsd_log_debug(("job::control: successful"));
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	 }
	END_TRY

	fsd_log_return(( "" ));
}
예제 #13
0
파일: job.c 프로젝트: eliv/slurm-drmaa-1
void
slurmdrmaa_job_create(
		fsd_drmaa_session_t *session,
		const fsd_template_t *jt,
		fsd_environ_t **envp,
		fsd_expand_drmaa_ph_t *expand, 
		job_desc_msg_t * job_desc,
		int n_job
		)
{
	const char *input_path_orig = NULL;
	const char *output_path_orig = NULL;
	const char *error_path_orig = NULL;
	char *volatile input_path = NULL;
	char *volatile output_path = NULL;
	char *volatile error_path = NULL;
	bool input_host = false;
	bool output_host = false;
	bool error_host = false;
	bool join_files = false;
	const char *value;
	const char *const *vector;
	const char *job_category = "default";
	
	job_desc->user_id = getuid();
	job_desc->group_id = getgid();

	job_desc->env_size = 0;
	
	/* job name */
	value = jt->get_attr( jt, DRMAA_JOB_NAME );
	if( value )
	{
		job_desc->name = fsd_strdup(value);
		fsd_log_debug(("# job_name = %s",job_desc->name));
	}
	
	/* job state at submit */
	value = jt->get_attr( jt, DRMAA_JS_STATE );
	if( value )
	{
		if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_ACTIVE ) )
		{}
		else if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_HOLD ) )
		{
			job_desc->priority = 0;
			fsd_log_debug(("# hold = user"));
		}
		else
		{
			fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE, "invalid value of drmaa_js_state attribute" );
		}
	}
	
	TRY
	{
		const char *command = NULL;
		char *command_expanded = NULL;
		char *temp_script_old = NULL;
		char *temp_script = "";
		const char *const *i;
		int j;

		/* remote command */
		command = jt->get_attr( jt, DRMAA_REMOTE_COMMAND );
		if( command == NULL )
			fsd_exc_raise_msg(
					FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,
					"drmaa_remote_command not set for job template"
					);

		command_expanded = expand->expand( expand, fsd_strdup(command), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );

		temp_script = fsd_asprintf("#!/bin/bash\n%s",command_expanded);
		fsd_free(command_expanded);

		/* arguments list */
		vector = jt->get_v_attr( jt, DRMAA_V_ARGV );

		if( vector )
	 	{
			for( i = vector, j = 2;  *i;  i++, j++ )
			{
				char *arg_expanded = expand->expand( expand, fsd_strdup(*i), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );
				
				temp_script_old = fsd_strdup(temp_script);
				
				if (strcmp(temp_script, "") != 0) {
					fsd_free(temp_script);
				}
				/* add too script */
				temp_script = fsd_asprintf("%s '%s'", temp_script_old, arg_expanded);
				fsd_free(temp_script_old);
				fsd_free(arg_expanded);
			}
		}
		
		job_desc->script = fsd_asprintf("%s\n", temp_script);
		fsd_log_debug(("# Script:\n%s", job_desc->script));
		fsd_free(temp_script);
	}
	END_TRY
	

	/* start time */
	value = jt->get_attr( jt, DRMAA_START_TIME );
	if( value )
 	{ 
		job_desc->begin_time = fsd_datetime_parse( value );
		fsd_log_debug(( "\n  drmaa_start_time: %s -> %ld", value, (long)job_desc->begin_time));
	}

	/*  propagate all environment variables from submission host */
	{
		extern char **environ;
		char **i;
		unsigned j = 0;

		for ( i = environ; *i; i++) {
			job_desc->env_size++;
		}
		
		fsd_log_debug(("environ env_size = %d",job_desc->env_size));
		fsd_calloc(job_desc->environment, job_desc->env_size+1, char *);
		
		for ( i = environ; *i; i++,j++ ) {
			job_desc->environment[j] = fsd_strdup(*i);
		}
	}

	/* environment */
	
	vector = jt->get_v_attr( jt, DRMAA_V_ENV );
	if( vector )
	{
		const char *const *i;
		unsigned j = 0;
		unsigned env_offset = job_desc->env_size;

		for( i = vector;  *i;  i++ )
 		{
			job_desc->env_size++;
		}
		fsd_log_debug(("jt env_size = %d",job_desc->env_size));

		fsd_log_debug(("# environment ="));
		fsd_realloc(job_desc->environment, job_desc->env_size+1, char *);

		for( i = vector;  *i;  i++,j++ )
 		{
			job_desc->environment[j + env_offset] = fsd_strdup(*i);
			fsd_log_debug((" %s", job_desc->environment[j+ env_offset]));
		}
	 }
	
 	/* wall clock time hard limit */
	value = jt->get_attr( jt, DRMAA_WCT_HLIMIT );
	if (value)
	{
		job_desc->time_limit = slurmdrmaa_datetime_parse( value );
		fsd_log_debug(("# wct_hlimit = %s -> %ld",value, (long int)slurmdrmaa_datetime_parse( value )));
	}

		
	/*expand->set(expand, FSD_DRMAA_PH_INCR,fsd_asprintf("%d", n_job));*/ /* set current value */
	/* TODO: test drmaa_ph_incr */
	/* job working directory */
	value = jt->get_attr( jt, DRMAA_WD );
	if( value )
	{
		char *cwd_expanded = expand->expand( expand, fsd_strdup(value), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_INCR );

		expand->set( expand, FSD_DRMAA_PH_WD, fsd_strdup(cwd_expanded));

		fsd_log_debug(("# work_dir = %s",cwd_expanded));
		job_desc->work_dir = fsd_strdup(cwd_expanded);
		fsd_free(cwd_expanded);
	}
	else
	{
		char cwdbuf[4096] = "";

		if ((getcwd(cwdbuf, 4095)) == NULL) {
			char errbuf[256] = "InternalError";
			(void)strerror_r(errno, errbuf, 256); /*on error the default message would be returned */
			fsd_log_error(("getcwd failed: %s", errbuf));
			job_desc->work_dir = fsd_strdup(".");
		} else {
			job_desc->work_dir = fsd_strdup(cwdbuf);
		}

		fsd_log_debug(("work_dir(default:CWD) %s", job_desc->work_dir));
	}

	TRY
 	{
		/* input path */
		input_path_orig = jt->get_attr( jt, DRMAA_INPUT_PATH );
		if( input_path_orig )
		{
			input_path = internal_map_file( expand, input_path_orig, &input_host,"input" );
			fsd_log_debug(( "\n  drmaa_input_path: %s -> %s", input_path_orig, input_path ));
		}

		/* output path */
		output_path_orig = jt->get_attr( jt, DRMAA_OUTPUT_PATH );
		if( output_path_orig )
		{
			output_path = internal_map_file( expand, output_path_orig, &output_host,"output" );
			fsd_log_debug(( "\n  drmaa_output_path: %s -> %s", output_path_orig, output_path ));
		}

		/* error path */
		error_path_orig = jt->get_attr( jt, DRMAA_ERROR_PATH );
		if( error_path_orig )
		{
			error_path = internal_map_file( expand, error_path_orig, &error_host,"error" );
			fsd_log_debug(( "\n  drmaa_error_path: %s -> %s", error_path_orig, error_path ));
		}

		/* join files */
		value = jt->get_attr( jt, DRMAA_JOIN_FILES );
		if( value )
		{
			if( (value[0] == 'y' || value[0] == 'Y')  &&  value[1] == '\0' )
				join_files = true;
			else if( (value[0] == 'n' || value[0] == 'N')  &&  value[1] == '\0' )
				join_files = false;
			else
				fsd_exc_raise_msg(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid value of drmaa_join_files attribute" );
		}

		if( join_files )
		{
			if( output_path == NULL )
				fsd_exc_raise_msg(FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,	"drmaa_join_files is set and output file is not given" );
			if( error_path!=NULL && 0 != strcmp( output_path, error_path ) )
				fsd_log_warning(( "Error file was given but will be ignored since drmaa_join_files was set." ));

			if (error_path)
				fsd_free(error_path);

			 error_path = fsd_strdup(output_path);
		}
		else
		{
			if( error_path == NULL  &&  output_path )
				error_path = fsd_strdup( "/dev/null" );
			if( output_path == NULL  &&  error_path )
				output_path = fsd_strdup( "/dev/null" );
		}


		/* email addresses to send notifications */
		vector = jt->get_v_attr( jt, DRMAA_V_EMAIL );
		if( vector  &&  vector[0] )
		{
			/* notifications can be sent to only one e-mail address */
			job_desc->mail_user = fsd_strdup(vector[0]);
			job_desc->mail_type = MAIL_JOB_BEGIN | MAIL_JOB_END |  MAIL_JOB_FAIL;
			fsd_log_debug(("# mail_user = %s\n",vector[0]));
			fsd_log_debug(("# mail_type = %o\n",job_desc->mail_type));
			if( vector[1] != NULL )
			{
				fsd_log_error(( "SLURM only supports one e-mail notification address" ));
				fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,"SLURM only supports one e-mail notification address");
			}
		}

		/* block email */
		value = jt->get_attr( jt, DRMAA_BLOCK_EMAIL );
		if( value )
		{
			bool block;
			if( strcmp(value, "0") == 0 )
			{
				block = true;
				fsd_log_debug(("# block_email = true"));
				fsd_log_debug(("# mail_user delated"));
				fsd_free(job_desc->mail_user);
				job_desc->mail_user = NULL;
			}
			else if( strcmp(value, "1") == 0 )
				block = false;
			else
				fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,"invalid value of drmaa_block_email attribute" );

			if( block && output_path == NULL )
			{
				fsd_log_debug(( "output path not set and we want to block e-mail, set to /dev/null" ));
				output_path = fsd_strdup( "/dev/null" );
			}
		}

		if( input_path )
		{
			job_desc->std_in = fsd_strdup(input_path);
			fsd_log_debug(("# input = %s", input_path));
		}

		if( output_path )
		{
			job_desc->std_out = fsd_strdup(output_path);
			fsd_log_debug(("# output = %s", output_path));
		}

		if( error_path )
		{
			job_desc->std_err = fsd_strdup(error_path);
			fsd_log_debug(("# error = %s", error_path));
		}
	 }
	FINALLY
	{
		fsd_free( input_path );
		fsd_free( output_path );
		fsd_free( error_path );
		input_path = NULL;
		output_path = NULL;
		error_path = NULL;
	}
	END_TRY			
	
	
	/* job category */
	value = jt->get_attr( jt, DRMAA_JOB_CATEGORY );
	if( value )
		job_category = value;

	{
		fsd_conf_option_t *category_value = NULL;
		category_value = fsd_conf_dict_get( session->job_categories, job_category );

		if( category_value != NULL )
	 	{
			if( category_value->type != FSD_CONF_STRING )
				fsd_exc_raise_fmt(
						FSD_ERRNO_INTERNAL_ERROR,
						"configuration error: job category should be string"
						);

			fsd_log_debug(("# Job category %s : %s\n",value,category_value->val.string));			
			slurmdrmaa_parse_native(job_desc,category_value->val.string);			
	 	}
		else
	 	{
			if( value != NULL )
				fsd_exc_raise_fmt(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid job category: %s", job_category
						);
	 	}
 	}

    /* set defaults for constraints - ref: slurm.h */
    fsd_log_debug(("# Setting defaults for tasks and processors" ));
    job_desc->num_tasks = 1;
    job_desc->min_cpus = 0;
    job_desc->cpus_per_task = 0;
    job_desc->pn_min_cpus = 0;

	/* native specification */
	value = jt->get_attr( jt, DRMAA_NATIVE_SPECIFICATION );
	if( value )
	{
		fsd_log_debug(("# Native specification: %s\n", value));
		slurmdrmaa_parse_native(job_desc, value);
	}
	
}
예제 #14
0
파일: job.c 프로젝트: eliv/slurm-drmaa-1
/**
 * Refreshes the DRMAA state of a job from SLURM.
 *
 * While holding the session's drm_connection_mutex, loads the job record
 * with slurm_load_job() for self->job_id and translates the base SLURM job
 * state into a DRMAA_PS_* value stored in self->state.  For finished jobs
 * self->exit_status is filled in as well.  self->last_update_time is set
 * whenever a record was obtained, and self->status_cond is broadcast once
 * the resulting state is DRMAA_PS_DONE or greater.
 *
 * If SLURM reports ESLURM_INVALID_JOB_ID, self->on_missing() is invoked;
 * any other load failure raises FSD_ERRNO_INTERNAL_ERROR.  The job record
 * is freed and the connection mutex released in the FINALLY section.
 *
 * @param self  Job whose status should be refreshed.
 */
static void
slurmdrmaa_job_update_status( fsd_job_t *self )
{
	job_info_msg_t *job_info = NULL;
	slurmdrmaa_job_t * slurm_self = (slurmdrmaa_job_t *) self;
	fsd_log_enter(( "({job_id=%s})", self->job_id ));

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	{
		if ( slurm_load_job( &job_info, fsd_atoi(self->job_id), SHOW_ALL) ) {
			/* capture the error code once, right after the failing call */
			int slurm_err = slurm_get_errno();

			if (slurm_err == ESLURM_INVALID_JOB_ID) {
				self->on_missing(self);
			} else {
				fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"slurm_load_jobs error: %s,job_id: %s", slurm_strerror(slurm_err), self->job_id);
			}
		}
		if (job_info) {
			fsd_log_debug(("state = %d, state_reason = %d", job_info->job_array[0].job_state, job_info->job_array[0].state_reason));
			
			/* only the base state selects the DRMAA state; flag bits are inspected below */
			switch(job_info->job_array[0].job_state & JOB_STATE_BASE)
			{

				case JOB_PENDING:
					switch(job_info->job_array[0].state_reason)
					{
						#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0)
						case WAIT_HELD_USER:   /* job is held by user */
							fsd_log_debug(("interpreting as DRMAA_PS_USER_ON_HOLD"));
							self->state = DRMAA_PS_USER_ON_HOLD;
							break;
						#endif
						case WAIT_HELD:  /* job is held by administrator */
							fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_ON_HOLD"));
							self->state = DRMAA_PS_SYSTEM_ON_HOLD;
							break;
						default:
							fsd_log_debug(("interpreting as DRMAA_PS_QUEUED_ACTIVE"));
							self->state = DRMAA_PS_QUEUED_ACTIVE;
							break;
					}
					break;
				case JOB_RUNNING:
					fsd_log_debug(("interpreting as DRMAA_PS_RUNNING"));
					self->state = DRMAA_PS_RUNNING;
					break;
				case JOB_SUSPENDED:
					if(slurm_self->user_suspended == true) {
						fsd_log_debug(("interpreting as DRMAA_PS_USER_SUSPENDED"));
						self->state = DRMAA_PS_USER_SUSPENDED;
					} else {
						fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_SUSPENDED"));
						self->state = DRMAA_PS_SYSTEM_SUSPENDED;
					}
					break;
				case JOB_COMPLETE:
					fsd_log_debug(("interpreting as DRMAA_PS_DONE"));
					self->state = DRMAA_PS_DONE;
					self->exit_status = job_info->job_array[0].exit_code;
					fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
					break;
				case JOB_CANCELLED:
					fsd_log_debug(("interpreting as DRMAA_PS_FAILED (aborted)"));
					self->state = DRMAA_PS_FAILED;
					self->exit_status = -1;
					/*
					 * Must not fall through: the failure branch below would
					 * replace the -1 marker with SLURM's exit code.
					 */
					break;
				case JOB_FAILED:
				case JOB_TIMEOUT:
				case JOB_NODE_FAIL:
				#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,3,0)
				case JOB_PREEMPTED:
				#endif
					fsd_log_debug(("interpreting as DRMAA_PS_FAILED"));
					self->state = DRMAA_PS_FAILED;
					self->exit_status = job_info->job_array[0].exit_code;
					fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
					break;
				default: /*unknown state */
					fsd_log_error(("Unknown job state: %d. Please send bug report: http://apps.man.poznan.pl/trac/slurm-drmaa", job_info->job_array[0].job_state));
					break;
			}

			/* the flag bits are only logged; they do not change the DRMAA state */
			if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_COMPLETING) {
				fsd_log_debug(("Epilog completing"));
			}

			if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_CONFIGURING) {
				fsd_log_debug(("Nodes booting"));
			}

			if (self->exit_status == -1) /* input,output,error path failure etc*/
				self->state = DRMAA_PS_FAILED;

			self->last_update_time = time(NULL);
		
			/* wake up anyone waiting on this job once it has reached a final state */
			if( self->state >= DRMAA_PS_DONE ) {
				fsd_log_debug(("exit_status = %d, WEXITSTATUS(exit_status) = %d", self->exit_status, WEXITSTATUS(self->exit_status)));
				fsd_cond_broadcast( &self->status_cond );
			}
		}
	}
	FINALLY
	{
		if(job_info != NULL)
			slurm_free_job_info_msg (job_info);

		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	}
	END_TRY
	
	fsd_log_return(( "" ));
}
예제 #15
0
/**
 * Waits until the given job terminates and collects its termination status.
 *
 * The job is looked up with self->get_job() and its status refreshed once.
 * While the session is not being destroyed and the job state is below
 * DRMAA_PS_DONE, the function either waits on job->status_cond (when
 * self->enable_wait_thread is set) or delegates to
 * self->wait_for_job_status_change() and then refreshes the status itself.
 *
 * @param self     Session the job belongs to.
 * @param job_id   Identifier of the job to wait for.
 * @param timeout  Passed to the timed wait; NULL means wait without a
 *                 time limit.
 * @param status   Passed through to job->get_termination_status().
 * @param rusage   Passed through to job->get_termination_status().
 * @param dispose  When true, the job is removed from self->jobs and
 *                 flagged FSD_JOB_DISPOSED after its status was read.
 *
 * Raises FSD_DRMAA_ERRNO_INVALID_JOB when the job is not found, and
 * FSD_DRMAA_ERRNO_EXIT_TIMEOUT when the timed wait ends without a signal
 * or when session destruction was requested.
 *
 * NOTE(review): the condition waits use job->mutex, so get_job() presumably
 * returns the job with that mutex held and release() drops it - confirm.
 */
void
fsd_drmaa_session_wait_for_single_job(
		fsd_drmaa_session_t *self,
		const char *job_id, const struct timespec *timeout,
		int *status, fsd_iter_t **rusage,
		bool dispose
		)
{
	/* volatile: both are modified inside TRY and read again in FINALLY */
	fsd_job_t *volatile job = NULL;
	volatile bool locked = false;

	fsd_log_enter(( "(%s)", job_id ));
	TRY
	 {
		job = self->get_job( self, job_id );
		if( job == NULL )
			fsd_exc_raise_fmt( FSD_DRMAA_ERRNO_INVALID_JOB,
					"Job '%s' not found in DRMS queue", job_id );
		job->update_status( job );
		while( !self->destroy_requested  &&  job->state < DRMAA_PS_DONE )
		 {
			bool signaled = true;
			fsd_log_debug(( "fsd_drmaa_session_wait_for_single_job: "
						"waiting for %s to terminate", job_id ));
			if( self->enable_wait_thread )
			 {
				if( timeout )
					signaled = fsd_cond_timedwait(
							&job->status_cond, &job->mutex, timeout );
				else
				 {
					fsd_cond_wait( &job->status_cond, &job->mutex );
				 }
				if( !signaled )
					fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );
			 }
			else
			 {
				self->wait_for_job_status_change(
						self, &job->status_cond, &job->mutex, timeout );
			 }

			fsd_log_debug(( "fsd_drmaa_session_wait_for_single_job: woken up" ));
			/* without a wait thread nobody else refreshes the state for us */
			if( !self->enable_wait_thread )
				job->update_status( job );
		 }

		if( self->destroy_requested )
			fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );

		job->get_termination_status( job, status, rusage );
		if( dispose )
		 {
			job->release( job ); /*release mutex in order to ensure proper order of locking: first job_set mutex then job mutex */
			/*
			 * The job is no longer held: clear the pointer so that FINALLY
			 * does not release it a second time if locking or the lookup
			 * below raises.
			 */
			job = NULL;

			locked = fsd_mutex_lock( &self->mutex );

			job = self->get_job( self, job_id );
			if (job != NULL)
			 {
				self->jobs->remove( self->jobs, job );
				job->flags |= FSD_JOB_DISPOSED;
			 }
			else
			 {
				fsd_log_error(("Some other thread has already reaped job %s", job_id ));
			 }

			locked = fsd_mutex_unlock( &self->mutex );
		 }
	 }
	FINALLY
	 {
		if ( job )
			job->release( job );
		if ( locked )
			fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY
	fsd_log_return((""));
}
예제 #16
0
/**
 * Creates a new DRMAA session object.
 *
 * Allocates the session, fills its method table with the default
 * fsd_drmaa_session_* implementations, sets the initial field values
 * (reference count 1, 5 second pool delay, wait thread enabled but not
 * started), initializes the mutexes and condition variables, creates the
 * job set and stores a copy of the contact string.
 *
 * @param contact  Contact string; duplicated with fsd_strdup().
 * @return The newly created session.
 *
 * If any step raises, the partially built session (when already allocated)
 * is passed to self->destroy() and the exception is re-raised.
 *
 * NOTE(review): on early failures destroy() runs before the mutexes and
 * condition variables are initialized - confirm destroy() tolerates that.
 */
fsd_drmaa_session_t *
fsd_drmaa_session_new( const char *contact )
{
	/* volatile: assigned inside TRY and read in the exception handler */
	fsd_drmaa_session_t *volatile self = NULL;

	fsd_log_enter(( "(%s)", contact ));
	TRY
	 {
		fsd_malloc( self, fsd_drmaa_session_t );

		/* method table: default implementations */
		self->release = fsd_drmaa_session_release;
		self->destroy = fsd_drmaa_session_destroy;
		self->destroy_nowait = fsd_drmaa_session_destroy_nowait;
		self->run_job = fsd_drmaa_session_run_job;
		self->run_bulk = fsd_drmaa_session_run_bulk;
		self->control_job = fsd_drmaa_session_control_job;
		self->job_ps = fsd_drmaa_session_job_ps;
		self->synchronize = fsd_drmaa_session_synchronize;
		self->wait = fsd_drmaa_session_wait;
		self->new_job = fsd_drmaa_session_new_job;
		self->run_impl = fsd_drmaa_session_run_impl;
		self->wait_for_single_job = fsd_drmaa_session_wait_for_single_job;
		self->wait_for_any_job = fsd_drmaa_session_wait_for_any_job;
		self->wait_for_job_status_change =
			fsd_drmaa_session_wait_for_job_status_change;
		self->wait_thread = fsd_drmaa_session_wait_thread;
		self->stop_wait_thread = fsd_drmaa_session_stop_wait_thread;
		self->update_all_jobs_status = fsd_drmaa_session_update_all_jobs_status;
		self->get_submited_job_ids = fsd_drmaa_session_get_submited_job_ids;
		self->get_job = fsd_drmaa_session_get_job;
		self->load_configuration = fsd_drmaa_session_load_configuration;
		self->read_configuration = fsd_drmaa_session_read_configuration;
		self->apply_configuration = fsd_drmaa_session_apply_configuration;

		/* initial state; pointer fields start as NULL before anything is allocated */
		self->ref_cnt = 1;
		self->destroy_requested = false;
		self->contact = NULL;
		self->jobs = NULL;
		self->configuration = NULL;
		self->pool_delay.tv_sec = 5;
		self->pool_delay.tv_nsec = 0;
		self->cache_job_state = 0;
		self->enable_wait_thread = true;
		self->job_categories = NULL;
		self->missing_jobs = FSD_REVEAL_MISSING_JOBS;
		self->wait_thread_started = false;
		self->wait_thread_run_flag = false;

		fsd_mutex_init( &self->mutex );
		fsd_cond_init( &self->wait_condition );
		fsd_cond_init( &self->destroy_condition );
		fsd_mutex_init( &self->drm_connection_mutex );
		self->jobs = fsd_job_set_new();
		self->contact = fsd_strdup( contact );
	 }
	EXCEPT_DEFAULT
	 {
		if( self != NULL )
			self->destroy( self );
		fsd_exc_reraise();
	 }
	END_TRY

	/* sizeof yields size_t; cast so the argument matches the %d conversion */
	fsd_log_debug(("sizeof(fsd_drmaa_session_t)=%d", (int)sizeof(fsd_drmaa_session_t)));
	return self;
}
예제 #17
0
/*
 * Waits until any job of the session has terminated and collects its
 * termination status.
 *
 * Repeatedly locks the session mutex and asks the job set for a terminated
 * job (set->find_terminated()).  If none is found it waits on
 * self->wait_condition - directly when self->enable_wait_thread is set,
 * otherwise through self->wait_for_job_status_change() - and tries again.
 *
 * Parameters:
 *   self     - session whose job set (self->jobs) is examined
 *   timeout  - passed to the timed wait; NULL means wait without time limit
 *   status   - passed through to job->get_termination_status()
 *   rusage   - passed through to job->get_termination_status()
 *   dispose  - when true and no exception is pending, the job is removed
 *              from the set and flagged FSD_JOB_DISPOSED
 *
 * Returns a copy (fsd_strdup) of the terminated job's id; presumably the
 * caller is expected to free it - confirm against callers.
 *
 * Raises FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION when session destruction was
 * requested, FSD_DRMAA_ERRNO_INVALID_JOB when the job set is empty, and
 * FSD_DRMAA_ERRNO_EXIT_TIMEOUT when the timed wait ends without a signal.
 */
char *
fsd_drmaa_session_wait_for_any_job(
		fsd_drmaa_session_t *self,
		const struct timespec *timeout,
		int *status, fsd_iter_t **rusage,
		bool dispose
		)
{
	fsd_job_set_t *set = self->jobs;
	/* volatile: modified inside TRY and read in the EXCEPT/FINALLY sections */
	fsd_job_t *volatile job = NULL;
	char *volatile job_id = NULL;
	volatile bool locked = false;

	fsd_log_enter(( "" ));

	TRY
	 {
		while( job == NULL )
		 {
			bool signaled = true;

			if( self->destroy_requested )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );

			/* without a wait thread the statuses are refreshed here, before taking the session mutex */
			if( !self->enable_wait_thread )
				self->update_all_jobs_status( self );

			/* `locked` records the lock state so FINALLY knows whether to unlock
			 * (presumably lock returns true and unlock false - confirm) */
			locked = fsd_mutex_lock( &self->mutex );
			if( set->empty( set ) )
				fsd_exc_raise_msg( FSD_DRMAA_ERRNO_INVALID_JOB,
						"No job found to be waited for" );

			/* leaves the loop with the session mutex still held; it is released in FINALLY */
			if( (job = set->find_terminated( set )) != NULL )
				break;

			if( self->destroy_requested )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_NO_ACTIVE_SESSION );
			if( self->enable_wait_thread )
			 {
				fsd_log_debug(( "wait_for_any_job: waiting for wait thread" ));
				if( timeout )
					signaled = fsd_cond_timedwait(
							&self->wait_condition, &self->mutex, timeout );
				else
					fsd_cond_wait( &self->wait_condition, &self->mutex );
			 }
			else
			 {
				fsd_log_debug(( "wait_for_any_job: waiting for next check" ));
				self->wait_for_job_status_change( self,
						&self->wait_condition, &self->mutex, timeout );
			 }
			locked = fsd_mutex_unlock( &self->mutex );
			fsd_log_debug((
						"wait_for_any_job: woken up; signaled=%d", signaled ));

			/* checked only after the mutex was released above */
			if( !signaled )
				fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );

		 }
		fsd_log_debug(( "wait_for_any_job: waiting finished" ));

		job_id = fsd_strdup( job->job_id );
		job->get_termination_status( job, status, rusage );
	 }
	EXCEPT_DEFAULT
	 {
		/* on failure no id is returned, so drop the copy before re-raising */
		if( job_id )
			fsd_free( job_id );
		fsd_exc_reraise();
	 }
	FINALLY
	 {
		if( job )
		 {
			/* NOTE(review): fsd_exc_get() == NULL presumably means no exception
			 * is pending, i.e. dispose only on success - confirm */
			if( fsd_exc_get() == NULL  &&  dispose )
			 {
				set->remove( set, job );
				job->flags |= FSD_JOB_DISPOSED;
			 }
			job->release( job );
		 }
		if( locked )
			fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY

	fsd_log_return(( " =%s", job_id ));
	return job_id;
}