void
fsd_job_get_termination_status( fsd_job_t *self,
			int *status, fsd_iter_t **rusage_out )
{
	fsd_iter_t* volatile rusage = NULL;

	TRY
	 {
		if( rusage_out )
		 {
			rusage = fsd_iter_new( NULL, 0 );
			rusage->append( rusage, fsd_asprintf(
						"submission_time=%ld", (long)self->submit_time ) );
			if (self->start_time)
				rusage->append( rusage, fsd_asprintf(
						"start_time=%ld", (long)self->start_time ) );
			if (self->end_time)
				rusage->append( rusage, fsd_asprintf(
						"end_time=%ld", (long)self->end_time ) );

			rusage->append( rusage, fsd_asprintf(
						"cpu=%ld", self->cpu_usage ) );
			rusage->append( rusage, fsd_asprintf(
						"mem=%ld", self->mem_usage ) );
			rusage->append( rusage, fsd_asprintf(
						"vmem=%ld", self->vmem_usage ) );
			rusage->append( rusage, fsd_asprintf(
						"walltime=%ld", self->walltime ) );
			rusage->append( rusage, fsd_asprintf(
						"hosts=%s", self->execution_hosts ) );

			if (self->queue) {
				rusage->append( rusage, fsd_asprintf("queue=%s", self->queue ) );
			}

			if (self->project) {
				rusage->append( rusage, fsd_asprintf("project=%s", self->project ) );
			}
		 }
	 }
	EXCEPT_DEFAULT
	 {
		if( rusage )
			rusage->destroy( rusage );
		if( rusage_out )
			*rusage_out = NULL;
		fsd_exc_reraise();
	 }
	ELSE
	 {
		if( status )
			*status = self->exit_status;
		if( rusage_out )
			*rusage_out = rusage;
	 }
	END_TRY
}
Exemplo n.º 2
0
void
slurmdrmaa_job_create_req(
		fsd_drmaa_session_t *session,
		const fsd_template_t *jt,
		fsd_environ_t **envp,
		job_desc_msg_t * job_desc,
		int n_job /* ~job_step */
		)
{
	fsd_expand_drmaa_ph_t *volatile expand = NULL;

	TRY
	 {
		expand = fsd_expand_drmaa_ph_new( NULL, NULL, fsd_asprintf("%d",n_job) );
		slurmdrmaa_job_create( session, jt, envp, expand, job_desc, n_job);
	 }
	EXCEPT_DEFAULT
	 {
		fsd_exc_reraise();
	 }
	FINALLY
	 {
		if( expand )
			expand->destroy( expand );
	 }
	END_TRY
}
Exemplo n.º 3
0
void
slurmdrmaa_job_create(
		fsd_drmaa_session_t *session,
		const fsd_template_t *jt,
		fsd_environ_t **envp,
		fsd_expand_drmaa_ph_t *expand, 
		job_desc_msg_t * job_desc,
		int n_job
		)
{
	const char *input_path_orig = NULL;
	const char *output_path_orig = NULL;
	const char *error_path_orig = NULL;
	char *volatile input_path = NULL;
	char *volatile output_path = NULL;
	char *volatile error_path = NULL;
	bool input_host = false;
	bool output_host = false;
	bool error_host = false;
	bool join_files = false;
	const char *value;
	const char *const *vector;
	const char *job_category = "default";
	
	job_desc->user_id = getuid();
	job_desc->group_id = getgid();

	job_desc->env_size = 0;
	
	/* job name */
	value = jt->get_attr( jt, DRMAA_JOB_NAME );
	if( value )
	{
		job_desc->name = fsd_strdup(value);
		fsd_log_debug(("# job_name = %s",job_desc->name));
	}
	
	/* job state at submit */
	value = jt->get_attr( jt, DRMAA_JS_STATE );
	if( value )
	{
		if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_ACTIVE ) )
		{}
		else if( 0 == strcmp( value, DRMAA_SUBMISSION_STATE_HOLD ) )
		{
			job_desc->priority = 0;
			fsd_log_debug(("# hold = user"));
		}
		else
		{
			fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE, "invalid value of drmaa_js_state attribute" );
		}
	}
	
	TRY
	{
		const char *command = NULL;
		char *command_expanded = NULL;
		char *temp_script_old = NULL;
		char *temp_script = "";
		const char *const *i;
		int j;

		/* remote command */
		command = jt->get_attr( jt, DRMAA_REMOTE_COMMAND );
		if( command == NULL )
			fsd_exc_raise_msg(
					FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,
					"drmaa_remote_command not set for job template"
					);

		command_expanded = expand->expand( expand, fsd_strdup(command), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );

		temp_script = fsd_asprintf("#!/bin/bash\n%s",command_expanded);
		fsd_free(command_expanded);

		/* arguments list */
		vector = jt->get_v_attr( jt, DRMAA_V_ARGV );

		if( vector )
	 	{
			for( i = vector, j = 2;  *i;  i++, j++ )
			{
				char *arg_expanded = expand->expand( expand, fsd_strdup(*i), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_WD );
				
				temp_script_old = fsd_strdup(temp_script);
				
				if (strcmp(temp_script, "") != 0) {
					fsd_free(temp_script);
				}
				/* add too script */
				temp_script = fsd_asprintf("%s '%s'", temp_script_old, arg_expanded);
				fsd_free(temp_script_old);
				fsd_free(arg_expanded);
			}
		}
		
		job_desc->script = fsd_asprintf("%s\n", temp_script);
		fsd_log_debug(("# Script:\n%s", job_desc->script));
		fsd_free(temp_script);
	}
	END_TRY
	

	/* start time */
	value = jt->get_attr( jt, DRMAA_START_TIME );
	if( value )
 	{ 
		job_desc->begin_time = fsd_datetime_parse( value );
		fsd_log_debug(( "\n  drmaa_start_time: %s -> %ld", value, (long)job_desc->begin_time));
	}

	/*  propagate all environment variables from submission host */
	{
		extern char **environ;
		char **i;
		unsigned j = 0;

		for ( i = environ; *i; i++) {
			job_desc->env_size++;
		}
		
		fsd_log_debug(("environ env_size = %d",job_desc->env_size));
		fsd_calloc(job_desc->environment, job_desc->env_size+1, char *);
		
		for ( i = environ; *i; i++,j++ ) {
			job_desc->environment[j] = fsd_strdup(*i);
		}
	}

	/* environment */
	
	vector = jt->get_v_attr( jt, DRMAA_V_ENV );
	if( vector )
	{
		const char *const *i;
		unsigned j = 0;
		unsigned env_offset = job_desc->env_size;

		for( i = vector;  *i;  i++ )
 		{
			job_desc->env_size++;
		}
		fsd_log_debug(("jt env_size = %d",job_desc->env_size));

		fsd_log_debug(("# environment ="));
		fsd_realloc(job_desc->environment, job_desc->env_size+1, char *);

		for( i = vector;  *i;  i++,j++ )
 		{
			job_desc->environment[j + env_offset] = fsd_strdup(*i);
			fsd_log_debug((" %s", job_desc->environment[j+ env_offset]));
		}
	 }
	
 	/* wall clock time hard limit */
	value = jt->get_attr( jt, DRMAA_WCT_HLIMIT );
	if (value)
	{
		job_desc->time_limit = slurmdrmaa_datetime_parse( value );
		fsd_log_debug(("# wct_hlimit = %s -> %ld",value, (long int)slurmdrmaa_datetime_parse( value )));
	}

		
	/*expand->set(expand, FSD_DRMAA_PH_INCR,fsd_asprintf("%d", n_job));*/ /* set current value */
	/* TODO: test drmaa_ph_incr */
	/* job working directory */
	value = jt->get_attr( jt, DRMAA_WD );
	if( value )
	{
		char *cwd_expanded = expand->expand( expand, fsd_strdup(value), FSD_DRMAA_PH_HD | FSD_DRMAA_PH_INCR );

		expand->set( expand, FSD_DRMAA_PH_WD, fsd_strdup(cwd_expanded));

		fsd_log_debug(("# work_dir = %s",cwd_expanded));
		job_desc->work_dir = fsd_strdup(cwd_expanded);
		fsd_free(cwd_expanded);
	}
	else
	{
		char cwdbuf[4096] = "";

		if ((getcwd(cwdbuf, 4095)) == NULL) {
			char errbuf[256] = "InternalError";
			(void)strerror_r(errno, errbuf, 256); /*on error the default message would be returned */
			fsd_log_error(("getcwd failed: %s", errbuf));
			job_desc->work_dir = fsd_strdup(".");
		} else {
			job_desc->work_dir = fsd_strdup(cwdbuf);
		}

		fsd_log_debug(("work_dir(default:CWD) %s", job_desc->work_dir));
	}

	TRY
 	{
		/* input path */
		input_path_orig = jt->get_attr( jt, DRMAA_INPUT_PATH );
		if( input_path_orig )
		{
			input_path = internal_map_file( expand, input_path_orig, &input_host,"input" );
			fsd_log_debug(( "\n  drmaa_input_path: %s -> %s", input_path_orig, input_path ));
		}

		/* output path */
		output_path_orig = jt->get_attr( jt, DRMAA_OUTPUT_PATH );
		if( output_path_orig )
		{
			output_path = internal_map_file( expand, output_path_orig, &output_host,"output" );
			fsd_log_debug(( "\n  drmaa_output_path: %s -> %s", output_path_orig, output_path ));
		}

		/* error path */
		error_path_orig = jt->get_attr( jt, DRMAA_ERROR_PATH );
		if( error_path_orig )
		{
			error_path = internal_map_file( expand, error_path_orig, &error_host,"error" );
			fsd_log_debug(( "\n  drmaa_error_path: %s -> %s", error_path_orig, error_path ));
		}

		/* join files */
		value = jt->get_attr( jt, DRMAA_JOIN_FILES );
		if( value )
		{
			if( (value[0] == 'y' || value[0] == 'Y')  &&  value[1] == '\0' )
				join_files = true;
			else if( (value[0] == 'n' || value[0] == 'N')  &&  value[1] == '\0' )
				join_files = false;
			else
				fsd_exc_raise_msg(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid value of drmaa_join_files attribute" );
		}

		if( join_files )
		{
			if( output_path == NULL )
				fsd_exc_raise_msg(FSD_DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES,	"drmaa_join_files is set and output file is not given" );
			if( error_path!=NULL && 0 != strcmp( output_path, error_path ) )
				fsd_log_warning(( "Error file was given but will be ignored since drmaa_join_files was set." ));

			if (error_path)
				fsd_free(error_path);

			 error_path = fsd_strdup(output_path);
		}
		else
		{
			if( error_path == NULL  &&  output_path )
				error_path = fsd_strdup( "/dev/null" );
			if( output_path == NULL  &&  error_path )
				output_path = fsd_strdup( "/dev/null" );
		}


		/* email addresses to send notifications */
		vector = jt->get_v_attr( jt, DRMAA_V_EMAIL );
		if( vector  &&  vector[0] )
		{
			/* only to one email address message may be send */
			job_desc->mail_user = fsd_strdup(vector[0]);
			job_desc->mail_type = MAIL_JOB_BEGIN | MAIL_JOB_END |  MAIL_JOB_FAIL;
			fsd_log_debug(("# mail_user = %s\n",vector[0]));
			fsd_log_debug(("# mail_type = %o\n",job_desc->mail_type));
			if( vector[1] != NULL )
			{
				fsd_log_error(( "SLURM only supports one e-mail notification address" ));
				fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,"SLURM only supports one e-mail notification address");
			}
		}

		/* block email */
		value = jt->get_attr( jt, DRMAA_BLOCK_EMAIL );
		if( value )
		{
			bool block;
			if( strcmp(value, "0") == 0 )
			{
				block = true;
				fsd_log_debug(("# block_email = true"));
				fsd_log_debug(("# mail_user delated"));
				fsd_free(job_desc->mail_user);
				job_desc->mail_user = NULL;
			}
			else if( strcmp(value, "1") == 0 )
				block = false;
			else
				fsd_exc_raise_msg(FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,"invalid value of drmaa_block_email attribute" );

			if( block && output_path == NULL )
			{
				fsd_log_debug(( "output path not set and we want to block e-mail, set to /dev/null" ));
				output_path = fsd_strdup( "/dev/null" );
			}
		}

		if( input_path )
		{
			job_desc->std_in = fsd_strdup(input_path);
			fsd_log_debug(("# input = %s", input_path));
		}

		if( output_path )
		{
			job_desc->std_out = fsd_strdup(output_path);
			fsd_log_debug(("# output = %s", output_path));
		}

		if( error_path )
		{
			job_desc->std_err = fsd_strdup(error_path);
			fsd_log_debug(("# error = %s", error_path));
		}
	 }
	FINALLY
	{
		fsd_free( input_path );
		fsd_free( output_path );
		fsd_free( error_path );
		input_path = NULL;
		output_path = NULL;
		error_path = NULL;
	}
	END_TRY			
	
	
	/* job category */
	value = jt->get_attr( jt, DRMAA_JOB_CATEGORY );
	if( value )
		job_category = value;

	{
		fsd_conf_option_t *category_value = NULL;
		category_value = fsd_conf_dict_get( session->job_categories, job_category );

		if( category_value != NULL )
	 	{
			if( category_value->type != FSD_CONF_STRING )
				fsd_exc_raise_fmt(
						FSD_ERRNO_INTERNAL_ERROR,
						"configuration error: job category should be string"
						);

			fsd_log_debug(("# Job category %s : %s\n",value,category_value->val.string));			
			slurmdrmaa_parse_native(job_desc,category_value->val.string);			
	 	}
		else
	 	{
			if( value != NULL )
				fsd_exc_raise_fmt(
						FSD_DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE,
						"invalid job category: %s", job_category
						);
	 	}
 	}

    /* set defaults for constraints - ref: slurm.h */
    fsd_log_debug(("# Setting defaults for tasks and processors" ));
    job_desc->num_tasks = 1;
    job_desc->min_cpus = 0;
    job_desc->cpus_per_task = 0;
    job_desc->pn_min_cpus = 0;

	/* native specification */
	value = jt->get_attr( jt, DRMAA_NATIVE_SPECIFICATION );
	if( value )
	{
		fsd_log_debug(("# Native specification: %s\n", value));
		slurmdrmaa_parse_native(job_desc, value);
	}
	
}
Exemplo n.º 4
0
fsd_iter_t *
slurmdrmaa_session_run_bulk(
		fsd_drmaa_session_t *self,
		const fsd_template_t *jt,
		int start, int end, int incr )
{
	int ret = 0;
	unsigned i = 0;
	int job_id = 0;
	int task_id = 0;
	fsd_job_t *volatile job = NULL;
	volatile unsigned n_jobs = (end - start) / incr + 1;
	char ** volatile job_ids = fsd_calloc( job_ids, n_jobs + 1, char* );
	volatile bool connection_lock = false;
	fsd_environ_t *volatile env = NULL;
	job_desc_msg_t job_desc;
	submit_response_msg_t *submit_response = NULL;

	slurmdrmaa_init_job_desc( &job_desc );

	TRY
	{
			connection_lock = fsd_mutex_lock( &self->drm_connection_mutex );
			slurmdrmaa_job_create_req( self, jt, (fsd_environ_t**)&env , &job_desc, 0 );

			/* Create job array spec if more than 1 task */
			if(n_jobs > 1)
			{
				fsd_calloc(job_desc.array_inx, ARRAY_INX_MAXLEN, char*);
				ret = snprintf(job_desc.array_inx, ARRAY_INX_MAXLEN, "%d-%d:%d", start, end, incr );
				if (ret < 0 || ret >= ARRAY_INX_MAXLEN) {
					fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "snprintf: not enough memory");
				}
				fsd_log_debug(("array job '%s' prepared", job_desc.array_inx));
			}

			/* Submit the batch job */
			if(slurm_submit_batch_job(&job_desc, &submit_response) != SLURM_SUCCESS){
				fsd_exc_raise_fmt(
					FSD_ERRNO_INTERNAL_ERROR,"slurm_submit_batch_job: %s",slurm_strerror(slurm_get_errno()));
			}

			connection_lock = fsd_mutex_unlock( &self->drm_connection_mutex );

			/* Watch each job in the array */
			for (i = 0; i < n_jobs; ++i) {
				job_id = (int) submit_response->job_id;
				task_id = start + i*incr;
				if (n_jobs > 1) {
					/* Array job */
					if (!working_cluster_rec)
						job_ids[i] = fsd_asprintf("%d_%d", job_id, task_id); /* .0*/
					else
						job_ids[i] = fsd_asprintf("%d_%d.%s", job_id, task_id, working_cluster_rec->name);
				} else {
					/* Single job */
					if (!working_cluster_rec)
						job_ids[i] = fsd_asprintf("%d", job_id); /* .0*/
					else
						job_ids[i] = fsd_asprintf("%d.%s", job_id, working_cluster_rec->name);
				}

				fsd_log_debug(("job %s submitted", job_ids[i]));
				job = slurmdrmaa_job_new( fsd_strdup(job_ids[i]) );
				job->session = self;
				job->submit_time = time(NULL);
				self->jobs->add( self->jobs, job );
				job->release( job );
				job = NULL;
			}

			if (working_cluster_rec)
				slurmdb_destroy_cluster_rec(working_cluster_rec);

			working_cluster_rec = NULL;
	 }