Exemple #1
0
static const char *
slurmdrmaa_get_DRM_system( fsd_drmaa_singletone_t *self )
{
	if(slurmdrmaa_version[0] == '\0') /*no locks as drmaa_get_drm_system is usually called only once */
	{
		slurm_ctl_conf_t * conf_info_msg_ptr = NULL; 
		if ( slurm_load_ctl_conf ((time_t) NULL, &conf_info_msg_ptr ) == -1 ) 
		{ 
			fsd_log_error(("slurm_load_ctl_conf error: %s",slurm_strerror(slurm_get_errno())));
			fsd_snprintf(NULL, slurmdrmaa_version, sizeof(slurmdrmaa_version)-1,"SLURM");
		}
		else
		{
			fsd_snprintf(NULL, slurmdrmaa_version, sizeof(slurmdrmaa_version)-1,"SLURM %s", conf_info_msg_ptr->version);
			slurm_free_ctl_conf (conf_info_msg_ptr);
		}
	}
	return slurmdrmaa_version;
}
void *
fsd_drmaa_session_wait_thread( fsd_drmaa_session_t *self )
{
	struct timespec ts, *next_check = &ts;
        bool volatile locked = false;

	fsd_log_enter(( "" ));
	locked = fsd_mutex_lock( &self->mutex );
	TRY
	 {
		while( self->wait_thread_run_flag )
			TRY
			 {
				fsd_log_debug(( "wait thread: next iteration" ));
				self->update_all_jobs_status( self );
				fsd_cond_broadcast( &self->wait_condition );
				
				fsd_get_time( next_check );
				fsd_ts_add( next_check, &self->pool_delay );
				fsd_cond_timedwait( &self->wait_condition, &self->mutex, next_check );
				
			 }
			EXCEPT_DEFAULT
			 {
				const fsd_exc_t *e = fsd_exc_get();
				fsd_log_error(( "wait thread: <%d:%s>", e->code(e), e->message(e) ));
			 }
			END_TRY
	 }
	FINALLY
	 { 
		if (locked)
			fsd_mutex_unlock( &self->mutex ); 
	 }
	END_TRY

	fsd_log_return(( " =NULL" ));
	return NULL;
}
void 
fsd_log_check_verbosity( void )
{
	const char *log_level_str = getenv("DRMAA_LOG_LEVEL");

	if (log_level_str == NULL) 
	 {
		return;
	 } 
	else if (strcmp(log_level_str, "TRACE") == 0) 
	 {
		fsd_verbose_level = FSD_LOG_TRACE;
	 } 
	else if (strcmp(log_level_str, "DEBUG") == 0) 
	 {
		fsd_verbose_level = FSD_LOG_DEBUG;
	 } 
	else if (strcmp(log_level_str, "INFO") == 0) 
	 {
		fsd_verbose_level = FSD_LOG_INFO;
	 } 
	else if (strcmp(log_level_str, "WARNING") == 0) 
	 {
		fsd_verbose_level = FSD_LOG_WARNING;
	 } 
	else if (strcmp(log_level_str, "ERROR") == 0) 
	 {
		fsd_verbose_level = FSD_LOG_ERROR;
	 } 
	else if (strcmp(log_level_str, "FATAL") == 0) 
	 {
		fsd_verbose_level = FSD_LOG_FATAL;
	 } 
	else 
	 {
		fsd_log_error(( "Illegal value of DRMAA_LOG_LEVEL=%s. Using default logging verbosity.", log_level_str));
	 } 
}
Exemple #4
0
void
slurmdrmaa_job_create(
		fsd_drmaa_session_t *session,
		const fsd_template_t *jt,
		fsd_environ_t **envp,
		fsd_expand_drmaa_ph_t *expand, 
		job_desc_msg_t * job_desc,
		int n_job
		)
{
	const char *input_path_orig = NULL;
	const char *output_path_orig = NULL;
	const char *error_path_orig = NULL;
	char *volatile input_path = NULL;
	char *volatile output_path = NULL;
	char *volatile error_path = NULL;
	bool input_host = false;
	bool output_host = false;
	bool error_host = false;
	bool join_files = false;
	const char *value;
	const char *const *vector;
	const char *job_category = "default";
	
	job_desc->user_id = getuid();
	job_desc->group_id = getgid();

	job_desc->env_size = 0;
	
	/* job name */
	value = jt->get_attr( jt, DRMAA_JOB_NAME );
	if( value )
	{
		job_desc->name = fsd_strdup(value);
		fsd_log_debug(("# job_name = %s",job_desc->name));
	}
	
	/* job state at submit */
	value = jt->get_attr( jt, DRMAA_JS_STATE );
	if( value )
	{
		if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_ACTIVE ) )
		{}
		else if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_HOLD ) )
		{
			job_desc->priority = 0;
			fsd_log_debug(("# hold = user"));
		}
		else
		{
			fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE, "invalid value of drmaa_js_state attribute" );
		}
	}
	
	TRY
	{
		const char *command = NULL;
		char *command_expanded = NULL;
		char *temp_script_old = NULL;
		char *temp_script = "";
		const char *const *i;
		int j;

		/* remote command */
		command = jt->get_attr( jt, DRMAA_REMOTE_COMMAND );
		if( command == NULL )
			fsd_exc_raise_msg(
					FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,
					"drmaa_remote_command not set for job template"
					);

		command_expanded = expand->expand( expand, fsd_strdup(command), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );

		temp_script = fsd_asprintf("#!/bin/bash\n%s",command_expanded);
		fsd_free(command_expanded);

		/* arguments list */
		vector = jt->get_v_attr( jt, DRMAA_V_ARGV );

		if( vector )
	 	{
			for( i = vector, j = 2;  *i;  i++, j++ )
			{
				char *arg_expanded = expand->expand( expand, fsd_strdup(*i), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );
				
				temp_script_old = fsd_strdup(temp_script);
				
				if (strcmp(temp_script, "") != 0) {
					fsd_free(temp_script);
				}
				/* add too script */
				temp_script = fsd_asprintf("%s '%s'", temp_script_old, arg_expanded);
				fsd_free(temp_script_old);
				fsd_free(arg_expanded);
			}
		}
		
		job_desc->script = fsd_asprintf("%s\n", temp_script);
		fsd_log_debug(("# Script:\n%s", job_desc->script));
		fsd_free(temp_script);
	}
	END_TRY
	

	/* start time */
	value = jt->get_attr( jt, DRMAA_START_TIME );
	if( value )
 	{ 
		job_desc->begin_time = fsd_datetime_parse( value );
		fsd_log_debug(( "\n  drmaa_start_time: %s -> %ld", value, (long)job_desc->begin_time));
	}

	/*  propagate all environment variables from submission host */
	{
		extern char **environ;
		char **i;
		unsigned j = 0;

		for ( i = environ; *i; i++) {
			job_desc->env_size++;
		}
		
		fsd_log_debug(("environ env_size = %d",job_desc->env_size));
		fsd_calloc(job_desc->environment, job_desc->env_size+1, char *);
		
		for ( i = environ; *i; i++,j++ ) {
			job_desc->environment[j] = fsd_strdup(*i);
		}
	}

	/* environment */
	
	vector = jt->get_v_attr( jt, DRMAA_V_ENV );
	if( vector )
	{
		const char *const *i;
		unsigned j = 0;
		unsigned env_offset = job_desc->env_size;

		for( i = vector;  *i;  i++ )
 		{
			job_desc->env_size++;
		}
		fsd_log_debug(("jt env_size = %d",job_desc->env_size));

		fsd_log_debug(("# environment ="));
		fsd_realloc(job_desc->environment, job_desc->env_size+1, char *);

		for( i = vector;  *i;  i++,j++ )
 		{
			job_desc->environment[j + env_offset] = fsd_strdup(*i);
			fsd_log_debug((" %s", job_desc->environment[j+ env_offset]));
		}
	 }
	
 	/* wall clock time hard limit */
	value = jt->get_attr( jt, DRMAA_WCT_HLIMIT );
	if (value)
	{
		job_desc->time_limit = slurmdrmaa_datetime_parse( value );
		fsd_log_debug(("# wct_hlimit = %s -> %ld",value, (long int)slurmdrmaa_datetime_parse( value )));
	}

		
	/*expand->set(expand, FSD_DRMAA_PH_INCR,fsd_asprintf("%d", n_job));*/ /* set current value */
	/* TODO: test drmaa_ph_incr */
	/* job working directory */
	value = jt->get_attr( jt, DRMAA_WD );
	if( value )
	{
		char *cwd_expanded = expand->expand( expand, fsd_strdup(value), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_INCR );

		expand->set( expand, FSD_DRMAA_PH_WD, fsd_strdup(cwd_expanded));

		fsd_log_debug(("# work_dir = %s",cwd_expanded));
		job_desc->work_dir = fsd_strdup(cwd_expanded);
		fsd_free(cwd_expanded);
	}
	else
	{
		char cwdbuf[4096] = "";

		if ((getcwd(cwdbuf, 4095)) == NULL) {
			char errbuf[256] = "InternalError";
			(void)strerror_r(errno, errbuf, 256); /*on error the default message would be returned */
			fsd_log_error(("getcwd failed: %s", errbuf));
			job_desc->work_dir = fsd_strdup(".");
		} else {
			job_desc->work_dir = fsd_strdup(cwdbuf);
		}

		fsd_log_debug(("work_dir(default:CWD) %s", job_desc->work_dir));
	}

	TRY
 	{
		/* input path */
		input_path_orig = jt->get_attr( jt, DRMAA_INPUT_PATH );
		if( input_path_orig )
		{
			input_path = internal_map_file( expand, input_path_orig, &input_host,"input" );
			fsd_log_debug(( "\n  drmaa_input_path: %s -> %s", input_path_orig, input_path ));
		}

		/* output path */
		output_path_orig = jt->get_attr( jt, DRMAA_OUTPUT_PATH );
		if( output_path_orig )
		{
			output_path = internal_map_file( expand, output_path_orig, &output_host,"output" );
			fsd_log_debug(( "\n  drmaa_output_path: %s -> %s", output_path_orig, output_path ));
		}

		/* error path */
		error_path_orig = jt->get_attr( jt, DRMAA_ERROR_PATH );
		if( error_path_orig )
		{
			error_path = internal_map_file( expand, error_path_orig, &error_host,"error" );
			fsd_log_debug(( "\n  drmaa_error_path: %s -> %s", error_path_orig, error_path ));
		}

		/* join files */
		value = jt->get_attr( jt, DRMAA_JOIN_FILES );
		if( value )
		{
			if( (value[0] == 'y' || value[0] == 'Y')  &&  value[1] == '\0' )
				join_files = true;
			else if( (value[0] == 'n' || value[0] == 'N')  &&  value[1] == '\0' )
				join_files = false;
			else
				fsd_exc_raise_msg(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid value of drmaa_join_files attribute" );
		}

		if( join_files )
		{
			if( output_path == NULL )
				fsd_exc_raise_msg(FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,	"drmaa_join_files is set and output file is not given" );
			if( error_path!=NULL && 0 != strcmp( output_path, error_path ) )
				fsd_log_warning(( "Error file was given but will be ignored since drmaa_join_files was set." ));

			if (error_path)
				fsd_free(error_path);

			 error_path = fsd_strdup(output_path);
		}
		else
		{
			if( error_path == NULL  &&  output_path )
				error_path = fsd_strdup( "/dev/null" );
			if( output_path == NULL  &&  error_path )
				output_path = fsd_strdup( "/dev/null" );
		}


		/* email addresses to send notifications */
		vector = jt->get_v_attr( jt, DRMAA_V_EMAIL );
		if( vector  &&  vector[0] )
		{
			/* only to one email address message may be send */
			job_desc->mail_user = fsd_strdup(vector[0]);
			job_desc->mail_type = MAIL_JOB_BEGIN | MAIL_JOB_END |  MAIL_JOB_FAIL;
			fsd_log_debug(("# mail_user = %s\n",vector[0]));
			fsd_log_debug(("# mail_type = %o\n",job_desc->mail_type));
			if( vector[1] != NULL )
			{
				fsd_log_error(( "SLURM only supports one e-mail notification address" ));
				fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,"SLURM only supports one e-mail notification address");
			}
		}

		/* block email */
		value = jt->get_attr( jt, DRMAA_BLOCK_EMAIL );
		if( value )
		{
			bool block;
			if( strcmp(value, "0") == 0 )
			{
				block = true;
				fsd_log_debug(("# block_email = true"));
				fsd_log_debug(("# mail_user delated"));
				fsd_free(job_desc->mail_user);
				job_desc->mail_user = NULL;
			}
			else if( strcmp(value, "1") == 0 )
				block = false;
			else
				fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,"invalid value of drmaa_block_email attribute" );

			if( block && output_path == NULL )
			{
				fsd_log_debug(( "output path not set and we want to block e-mail, set to /dev/null" ));
				output_path = fsd_strdup( "/dev/null" );
			}
		}

		if( input_path )
		{
			job_desc->std_in = fsd_strdup(input_path);
			fsd_log_debug(("# input = %s", input_path));
		}

		if( output_path )
		{
			job_desc->std_out = fsd_strdup(output_path);
			fsd_log_debug(("# output = %s", output_path));
		}

		if( error_path )
		{
			job_desc->std_err = fsd_strdup(error_path);
			fsd_log_debug(("# error = %s", error_path));
		}
	 }
	FINALLY
	{
		fsd_free( input_path );
		fsd_free( output_path );
		fsd_free( error_path );
		input_path = NULL;
		output_path = NULL;
		error_path = NULL;
	}
	END_TRY			
	
	
	/* job category */
	value = jt->get_attr( jt, DRMAA_JOB_CATEGORY );
	if( value )
		job_category = value;

	{
		fsd_conf_option_t *category_value = NULL;
		category_value = fsd_conf_dict_get( session->job_categories, job_category );

		if( category_value != NULL )
	 	{
			if( category_value->type != FSD_CONF_STRING )
				fsd_exc_raise_fmt(
						FSD_ERRNO_INTERNAL_ERROR,
						"configuration error: job category should be string"
						);

			fsd_log_debug(("# Job category %s : %s\n",value,category_value->val.string));			
			slurmdrmaa_parse_native(job_desc,category_value->val.string);			
	 	}
		else
	 	{
			if( value != NULL )
				fsd_exc_raise_fmt(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid job category: %s", job_category
						);
	 	}
 	}

    /* set defaults for constraints - ref: slurm.h */
    fsd_log_debug(("# Setting defaults for tasks and processors" ));
    job_desc->num_tasks = 1;
    job_desc->min_cpus = 0;
    job_desc->cpus_per_task = 0;
    job_desc->pn_min_cpus = 0;

	/* native specification */
	value = jt->get_attr( jt, DRMAA_NATIVE_SPECIFICATION );
	if( value )
	{
		fsd_log_debug(("# Native specification: %s\n", value));
		slurmdrmaa_parse_native(job_desc, value);
	}
	
}
Exemple #5
0
static void
slurmdrmaa_job_update_status( fsd_job_t *self )
{
	job_info_msg_t *job_info = NULL;
	slurmdrmaa_job_t * slurm_self = (slurmdrmaa_job_t *) self;
	fsd_log_enter(( "({job_id=%s})", self->job_id ));

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	{
		if ( slurm_load_job( &job_info, fsd_atoi(self->job_id), SHOW_ALL) ) {
			int _slurm_errno = slurm_get_errno();

			if (_slurm_errno == ESLURM_INVALID_JOB_ID) {
				self->on_missing(self);
			} else {
				fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,"slurm_load_jobs error: %s,job_id: %s", slurm_strerror(slurm_get_errno()), self->job_id);
			}
		}
		if (job_info) {
			fsd_log_debug(("state = %d, state_reason = %d", job_info->job_array[0].job_state, job_info->job_array[0].state_reason));
			
			switch(job_info->job_array[0].job_state & JOB_STATE_BASE)
			{

				case JOB_PENDING:
					switch(job_info->job_array[0].state_reason)
					{
						#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,2,0)
						case WAIT_HELD_USER:   /* job is held by user */
							fsd_log_debug(("interpreting as DRMAA_PS_USER_ON_HOLD"));
							self->state = DRMAA_PS_USER_ON_HOLD;
							break;
						#endif
						case WAIT_HELD:  /* job is held by administrator */
							fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_ON_HOLD"));
							self->state = DRMAA_PS_SYSTEM_ON_HOLD;
							break;
						default:
							fsd_log_debug(("interpreting as DRMAA_PS_QUEUED_ACTIVE"));
							self->state = DRMAA_PS_QUEUED_ACTIVE;
					}
					break;
				case JOB_RUNNING:
					fsd_log_debug(("interpreting as DRMAA_PS_RUNNING"));
					self->state = DRMAA_PS_RUNNING;
					break;
				case JOB_SUSPENDED:
					if(slurm_self->user_suspended == true) {
						fsd_log_debug(("interpreting as DRMAA_PS_USER_SUSPENDED"));
						self->state = DRMAA_PS_USER_SUSPENDED;
					} else {
						fsd_log_debug(("interpreting as DRMAA_PS_SYSTEM_SUSPENDED"));
						self->state = DRMAA_PS_SYSTEM_SUSPENDED;
					}
					break;
				case JOB_COMPLETE:
					fsd_log_debug(("interpreting as DRMAA_PS_DONE"));
					self->state = DRMAA_PS_DONE;
					self->exit_status = job_info->job_array[0].exit_code;
					fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
					break;
				case JOB_CANCELLED:
					fsd_log_debug(("interpreting as DRMAA_PS_FAILED (aborted)"));
					self->state = DRMAA_PS_FAILED;
					self->exit_status = -1;
				case JOB_FAILED:
				case JOB_TIMEOUT:
				case JOB_NODE_FAIL:
				#if SLURM_VERSION_NUMBER >= SLURM_VERSION_NUM(2,3,0)
				case JOB_PREEMPTED:
				#endif
					fsd_log_debug(("interpreting as DRMAA_PS_FAILED"));
					self->state = DRMAA_PS_FAILED;
					self->exit_status = job_info->job_array[0].exit_code;
					fsd_log_debug(("exit_status = %d -> %d",self->exit_status, WEXITSTATUS(self->exit_status)));
					break;
				default: /*unknown state */
					fsd_log_error(("Unknown job state: %d. Please send bug report: http://apps.man.poznan.pl/trac/slurm-drmaa", job_info->job_array[0].job_state));
			}

			if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_COMPLETING) {
				fsd_log_debug(("Epilog completing"));
			}

			if (job_info->job_array[0].job_state & JOB_STATE_FLAGS & JOB_CONFIGURING) {
				fsd_log_debug(("Nodes booting"));
			}

			if (self->exit_status == -1) /* input,output,error path failure etc*/
				self->state = DRMAA_PS_FAILED;

			self->last_update_time = time(NULL);
		
			if( self->state >= DRMAA_PS_DONE ) {
				fsd_log_debug(("exit_status = %d, WEXITSTATUS(exit_status) = %d", self->exit_status, WEXITSTATUS(self->exit_status)));
				fsd_cond_broadcast( &self->status_cond );
			}
		}
	}
	FINALLY
	{
		if(job_info != NULL)
			slurm_free_job_info_msg (job_info);

		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	}
	END_TRY
	
	fsd_log_return(( "" ));
}
void
fsd_drmaa_session_wait_for_single_job(
		fsd_drmaa_session_t *self,
		const char *job_id, const struct timespec *timeout,
		int *status, fsd_iter_t **rusage,
		bool dispose
		)
{
	fsd_job_t *volatile job = NULL;
	volatile bool locked = false;

	fsd_log_enter(( "(%s)", job_id ));
	TRY
	 {
		job = self->get_job( self, job_id );
		if( job == NULL )
			fsd_exc_raise_fmt( FSD_DRMAA_ERRNO_INVALID_JOB,
					"Job '%s' not found in DRMS queue", job_id );
		job->update_status( job );
		while( !self->destroy_requested  &&  job->state < DRMAA_PS_DONE )
		 {
			bool signaled = true;
			fsd_log_debug(( "fsd_drmaa_session_wait_for_single_job: "
						"waiting for %s to terminate", job_id ));
			if( self->enable_wait_thread )
			 {
				if( timeout )
					signaled = fsd_cond_timedwait(
							&job->status_cond, &job->mutex, timeout );
				else
				 {
					fsd_cond_wait( &job->status_cond, &job->mutex );
				 }
				if( !signaled )
					fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );
			 }
			else
			 {
				self->wait_for_job_status_change(
						self, &job->status_cond, &job->mutex, timeout );
			 }

			fsd_log_debug(( "fsd_drmaa_session_wait_for_single_job: woken up" ));
			if( !self->enable_wait_thread )
				job->update_status( job );
		 }

		if( self->destroy_requested )
			fsd_exc_raise_code( FSD_DRMAA_ERRNO_EXIT_TIMEOUT );

		job->get_termination_status( job, status, rusage );
		if( dispose )
		 {
			job->release( job ); /*release mutex in order to ensure proper order of locking: first job_set mutex then job mutex */

			locked = fsd_mutex_lock( &self->mutex );

			job = self->get_job( self, job_id );
			if (job != NULL)
			 {
				self->jobs->remove( self->jobs, job );
				job->flags |= FSD_JOB_DISPOSED;
			 }
			else
			 {
				fsd_log_error(("Some other thread has already reaped job %s", job_id ));
			 }

			locked = fsd_mutex_unlock( &self->mutex );
		 }
	 }
	FINALLY
	 {
		if ( job )
			job->release( job );
		if ( locked )
			fsd_mutex_unlock( &self->mutex );
	 }
	END_TRY
	fsd_log_return((""));
}