Example #1
0
/*
 * slurm_update_job2 - issue RPC to a job's configuration per request,
 *	only usable by user root or (for some parameters) the job's owner
 * IN job_msg - description of job updates
 * OUT resp - per task response to the request,
 *	      free using slurm_free_job_array_resp()
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 */
extern int
slurm_update_job2 (job_desc_msg_t * job_msg, job_array_resp_msg_t **resp)
{
	int rc = SLURM_SUCCESS;
	slurm_msg_t req_msg, resp_msg;
	slurmdb_cluster_rec_t *save_working_cluster_rec = working_cluster_rec;

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type	= REQUEST_UPDATE_JOB;
	req_msg.data		= job_msg;

tryagain:
	slurm_msg_t_init(&resp_msg);

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);
	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_REROUTE_MSG:
	{
		reroute_msg_t *rr_msg = (reroute_msg_t *)resp_msg.data;

		/* Don't expect mutliple hops but in the case it does
		 * happen, free the previous rr cluster_rec. */
		if (working_cluster_rec &&
		    working_cluster_rec != save_working_cluster_rec)
			slurmdb_destroy_cluster_rec(
						working_cluster_rec);

		working_cluster_rec = rr_msg->working_cluster_rec;
		slurmdb_setup_cluster_rec(working_cluster_rec);
		rr_msg->working_cluster_rec = NULL;
		goto tryagain;
	}
	case RESPONSE_JOB_ARRAY_ERRORS:
		*resp = (job_array_resp_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		if (rc)
			slurm_seterrno(rc);
		break;
	default:
		slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR);
	}

	if (working_cluster_rec != save_working_cluster_rec) {
		slurmdb_destroy_cluster_rec(working_cluster_rec);
		working_cluster_rec = save_working_cluster_rec;
	}

	return rc;
}
Example #2
0
extern void
slurm_setup_remote_working_cluster(resource_allocation_response_msg_t *msg)
{
	xassert(msg);
	xassert(msg->working_cluster_rec);
	xassert(msg->node_list);
	xassert(msg->node_addr);

	if (working_cluster_rec)
		slurmdb_destroy_cluster_rec(working_cluster_rec);

	working_cluster_rec = (slurmdb_cluster_rec_t *)msg->working_cluster_rec;
	msg->working_cluster_rec = NULL;

	working_cluster_rec->plugin_id_select =
		select_get_plugin_id_pos(working_cluster_rec->plugin_id_select);

	slurm_set_addr(&working_cluster_rec->control_addr,
		       working_cluster_rec->control_port,
		       working_cluster_rec->control_host);

	if (setenvf(NULL, "SLURM_CLUSTER_NAME", "%s",
		    working_cluster_rec->name) < 0)
		error("unable to set SLURM_CLUSTER_NAME in environment");

	add_remote_nodes_to_conf_tbls(msg->node_list, msg->node_addr);
}
Example #3
0
extern int sacctmgr_modify_cluster(int argc, char **argv)
{
	int rc = SLURM_SUCCESS;
	int i=0;
	slurmdb_cluster_rec_t *cluster =
		xmalloc(sizeof(slurmdb_cluster_rec_t));
	slurmdb_assoc_rec_t *assoc =
		xmalloc(sizeof(slurmdb_assoc_rec_t));
	slurmdb_assoc_cond_t *assoc_cond =
		xmalloc(sizeof(slurmdb_assoc_cond_t));
	int cond_set = 0, prev_set = 0, rec_set = 0, set = 0;
	List ret_list = NULL;
	slurmdb_cluster_cond_t cluster_cond;
	bool existing_fed = false;

	slurmdb_init_assoc_rec(assoc, 0);

	assoc_cond->cluster_list = list_create(slurm_destroy_char);
	assoc_cond->acct_list = list_create(NULL);

	slurmdb_init_cluster_rec(cluster, 0);
	slurmdb_init_cluster_cond(&cluster_cond, 0);
	cluster_cond.cluster_list = assoc_cond->cluster_list;

	for (i=0; i<argc; i++) {
		int command_len = strlen(argv[i]);
		if (!strncasecmp(argv[i], "Where", MAX(command_len, 5))) {
			i++;
			prev_set = _set_cond(&i, argc, argv,
					     &cluster_cond, NULL);
			cond_set |= prev_set;
		} else if (!strncasecmp(argv[i], "Set", MAX(command_len, 3))) {
			i++;
			prev_set = _set_rec(&i, argc, argv,
					    NULL, assoc, cluster);
			rec_set |= prev_set;
		} else {
			prev_set = _set_cond(&i, argc, argv,
					     &cluster_cond, NULL);
			cond_set |= prev_set;
		}
	}

	if (exit_code) {
		rc = SLURM_ERROR;
		goto end_it;
	} else if (!rec_set) {
		exit_code=1;
		fprintf(stderr, " You didn't give me anything to set\n");
		rc = SLURM_ERROR;
		goto end_it;
	} else if (!cond_set) {
		if (!commit_check("You didn't set any conditions with 'WHERE'.\n"
				 "Are you sure you want to continue?")) {
			printf("Aborted\n");
			rc = SLURM_SUCCESS;
			goto end_it;
		}
	}

	if (cluster->fed.name && cluster->fed.name[0]) {
		int rc;
		/* Make sure federation exists. */
		List fed_list = list_create(slurm_destroy_char);
		list_append(fed_list, xstrdup(cluster->fed.name));
		rc = verify_federations_exist(fed_list);
		FREE_NULL_LIST(fed_list);
		if (rc)
			goto end_it;

		/* See if cluster is assigned to another federation already. */
		if (list_count(cluster_cond.cluster_list)) {
			if (_verify_fed_clusters(cluster_cond.cluster_list,
						 cluster->fed.name,
						 &existing_fed))
					goto end_it;
			else if (!list_count(cluster_cond.cluster_list)) {
				/* irrelevant changes have been removed and
				 * nothing to change now. */
				printf("Nothing to change\n");
				rc = SLURM_ERROR;
				(void)rc; /* CLANG false positive */
				goto end_it;
			} else if (existing_fed) {
				char *warning =
					"\nAre you sure you want to continue?";
				if (!commit_check(warning)) {
					rc = SLURM_ERROR;
					(void)rc; /* CLANG false positive */
					goto end_it;
				}
			}
		}
	}

	if (cond_set & 1) {
		List temp_list = NULL;

		temp_list = acct_storage_g_get_clusters(db_conn, my_uid,
							&cluster_cond);
		if (!temp_list) {
			exit_code=1;
			fprintf(stderr,
				" Problem getting clusters from database.  "
				"Contact your admin.\n");
			rc = SLURM_ERROR;
			goto end_it;
		} else if (!list_count(temp_list)) {
			fprintf(stderr,
				" Query didn't return any clusters.\n");
			rc = SLURM_ERROR;
			goto end_it;
		}
		/* we are only looking for the clusters returned from
		   this query, so we free the cluster_list and replace
		   it */
		FREE_NULL_LIST(assoc_cond->cluster_list);
		assoc_cond->cluster_list = temp_list;
	}

	printf(" Setting\n");
	if (rec_set & CLUS_REC_SET)
		sacctmgr_print_cluster(cluster);
	if (rec_set & CLUS_ASSOC_SET) {
		printf("  Default Limits:\n");
		sacctmgr_print_assoc_limits(assoc);
	}

	if (rec_set & CLUS_REC_SET) {
		notice_thread_init();
		ret_list = acct_storage_g_modify_clusters(
			db_conn, my_uid, &cluster_cond, cluster);

		if (ret_list && list_count(ret_list)) {
			char *object = NULL;
			ListIterator itr = list_iterator_create(ret_list);
			printf(" Modified cluster...\n");
			while((object = list_next(itr))) {
				printf("  %s\n", object);
			}
			list_iterator_destroy(itr);
			set = 1;
		} else if (ret_list) {
			printf(" Nothing modified\n");
			rc = SLURM_ERROR;
		} else {
			exit_code=1;
			fprintf(stderr, " Error with request: %s\n",
				slurm_strerror(errno));
			rc = SLURM_ERROR;
		}

		FREE_NULL_LIST(ret_list);
		notice_thread_fini();
	}

	if (rec_set & CLUS_ASSOC_SET) {
		list_append(assoc_cond->acct_list, "root");
		notice_thread_init();
		ret_list = acct_storage_g_modify_assocs(db_conn, my_uid,
							assoc_cond, assoc);

		if (ret_list && list_count(ret_list)) {
			char *object = NULL;
			ListIterator itr = list_iterator_create(ret_list);
			printf(" Modified cluster defaults for "
			       "associations...\n");
			while((object = list_next(itr))) {
				printf("  %s\n", object);
			}
			list_iterator_destroy(itr);
			set = 1;
		} else if (ret_list) {
			printf(" Nothing modified\n");
			rc = SLURM_ERROR;
		} else {
			exit_code=1;
			fprintf(stderr, " Error with request: %s\n",
				slurm_strerror(errno));
			rc = SLURM_ERROR;
		}
		FREE_NULL_LIST(ret_list);
		notice_thread_fini();
	}


	if (set) {
		if (commit_check("Would you like to commit changes?"))
			acct_storage_g_commit(db_conn, 1);
		else {
			printf(" Changes Discarded\n");
			acct_storage_g_commit(db_conn, 0);
		}
	}
end_it:
	slurmdb_destroy_assoc_cond(assoc_cond);
	slurmdb_destroy_assoc_rec(assoc);
	slurmdb_destroy_cluster_rec(cluster);

	return rc;
}
Example #4
0
extern int sacctmgr_add_cluster(int argc, char **argv)
{
	int rc = SLURM_SUCCESS;
	int i = 0;
	slurmdb_cluster_rec_t *cluster = NULL;
	slurmdb_cluster_rec_t *start_cluster =
		xmalloc(sizeof(slurmdb_cluster_rec_t));
	List name_list = list_create(slurm_destroy_char);
	List cluster_list = NULL;
	slurmdb_assoc_rec_t start_assoc;

	int limit_set = 0;
	ListIterator itr = NULL, itr_c = NULL;
	char *name = NULL;

	slurmdb_init_assoc_rec(&start_assoc, 0);
	slurmdb_init_cluster_rec(start_cluster, 0);

	for (i=0; i<argc; i++) {
		int command_len = strlen(argv[i]);
		if (!strncasecmp(argv[i], "Where", MAX(command_len, 5))
		    || !strncasecmp(argv[i], "Set", MAX(command_len, 3)))
			i++;
		limit_set += _set_rec(&i, argc, argv,
				      name_list, &start_assoc, start_cluster);
	}
	if (exit_code) {
		FREE_NULL_LIST(name_list);
		slurmdb_destroy_cluster_rec(start_cluster);
		return SLURM_ERROR;
	} else if (!list_count(name_list)) {
		FREE_NULL_LIST(name_list);
		slurmdb_destroy_cluster_rec(start_cluster);
		exit_code=1;
		fprintf(stderr, " Need name of cluster to add.\n");
		return SLURM_ERROR;
	} else {
		List temp_list = NULL;
		slurmdb_cluster_cond_t cluster_cond;

		slurmdb_init_cluster_cond(&cluster_cond, 0);
		cluster_cond.cluster_list = name_list;
		cluster_cond.classification = start_cluster->classification;

		temp_list = acct_storage_g_get_clusters(db_conn, my_uid,
							&cluster_cond);
		if (!temp_list) {
			exit_code=1;
			fprintf(stderr,
				" Problem getting clusters from database.  "
				"Contact your admin.\n");
			slurmdb_destroy_cluster_rec(start_cluster);
			return SLURM_ERROR;
		}

		itr_c = list_iterator_create(name_list);
		itr = list_iterator_create(temp_list);
		while((name = list_next(itr_c))) {
			slurmdb_cluster_rec_t *cluster_rec = NULL;

			list_iterator_reset(itr);
			while((cluster_rec = list_next(itr))) {
				if (!xstrcasecmp(cluster_rec->name, name))
					break;
			}
			if (cluster_rec) {
				printf(" This cluster %s already exists.  "
				       "Not adding.\n", name);
				list_delete_item(itr_c);
			}
		}
		list_iterator_destroy(itr);
		list_iterator_destroy(itr_c);
		FREE_NULL_LIST(temp_list);
		if (!list_count(name_list)) {
			FREE_NULL_LIST(name_list);
			slurmdb_destroy_cluster_rec(start_cluster);
			return SLURM_ERROR;
		}
	}

	if (start_cluster->fed.name) {
		int rc;
		List fed_list = list_create(slurm_destroy_char);
		list_append(fed_list, xstrdup(start_cluster->fed.name));
		rc = verify_federations_exist(fed_list);
		FREE_NULL_LIST(fed_list);
		if (rc) {
			slurmdb_destroy_cluster_rec(start_cluster);
			FREE_NULL_LIST(name_list);
			return SLURM_ERROR;
		}
	}

	printf(" Adding Cluster(s)\n");
	cluster_list = list_create(slurmdb_destroy_cluster_rec);
	itr = list_iterator_create(name_list);
	while((name = list_next(itr))) {
		if (!name[0]) {
			exit_code=1;
			fprintf(stderr, " No blank names are "
				"allowed when adding.\n");
			rc = SLURM_ERROR;
			continue;
		}
		cluster = xmalloc(sizeof(slurmdb_cluster_rec_t));
		slurmdb_init_cluster_rec(cluster, 0);

		list_append(cluster_list, cluster);
		slurmdb_copy_cluster_rec(cluster, start_cluster);
		cluster->name = xstrdup(name);
		printf("  Name           = %s\n", cluster->name);

		cluster->root_assoc =
			xmalloc(sizeof(slurmdb_assoc_rec_t));
		slurmdb_init_assoc_rec(cluster->root_assoc, 0);
		cluster->root_assoc->def_qos_id = start_assoc.def_qos_id;
		cluster->root_assoc->shares_raw = start_assoc.shares_raw;

		slurmdb_copy_assoc_rec_limits(
			cluster->root_assoc, &start_assoc);
	}
	list_iterator_destroy(itr);
	FREE_NULL_LIST(name_list);

	if (limit_set)
		printf(" Setting\n");
	if (limit_set & CLUS_REC_SET)
		sacctmgr_print_cluster(start_cluster);
	if (limit_set & CLUS_ASSOC_SET) {
		printf("  Default Limits:\n");
		sacctmgr_print_assoc_limits(&start_assoc);
		FREE_NULL_LIST(start_assoc.qos_list);
	}
	slurmdb_destroy_cluster_rec(start_cluster);

	if (!list_count(cluster_list)) {
		printf(" Nothing new added.\n");
		rc = SLURM_ERROR;
		goto end_it;
	}

	/* Since we are creating tables with add cluster that can't be
	   rolled back.  So we ask before hand if they are serious
	   about it so we can rollback if needed.
	*/
	if (commit_check("Would you like to commit changes?")) {
		notice_thread_init();
		rc = acct_storage_g_add_clusters(db_conn, my_uid, cluster_list);
		notice_thread_fini();
		if (rc == SLURM_SUCCESS) {
			acct_storage_g_commit(db_conn, 1);
		} else {
			exit_code=1;
			fprintf(stderr, " Problem adding clusters: %s\n",
				slurm_strerror(rc));
			/* this isn't really needed, but just to be safe */
			acct_storage_g_commit(db_conn, 0);
		}
	} else {
		printf(" Changes Discarded\n");
		/* this isn't really needed, but just to be safe */
		acct_storage_g_commit(db_conn, 0);
	}

end_it:
	FREE_NULL_LIST(cluster_list);

	return rc;
}
Example #5
0
fsd_iter_t *
slurmdrmaa_session_run_bulk(
		fsd_drmaa_session_t *self,
		const fsd_template_t *jt,
		int start, int end, int incr )
{
	int ret = 0;
	unsigned i = 0;
	int job_id = 0;
	int task_id = 0;
	fsd_job_t *volatile job = NULL;
	volatile unsigned n_jobs = (end - start) / incr + 1;
	char ** volatile job_ids = fsd_calloc( job_ids, n_jobs + 1, char* );
	volatile bool connection_lock = false;
	fsd_environ_t *volatile env = NULL;
	job_desc_msg_t job_desc;
	submit_response_msg_t *submit_response = NULL;

	slurmdrmaa_init_job_desc( &job_desc );

	TRY
	{
			connection_lock = fsd_mutex_lock( &self->drm_connection_mutex );
			slurmdrmaa_job_create_req( self, jt, (fsd_environ_t**)&env , &job_desc, 0 );

			/* Create job array spec if more than 1 task */
			if(n_jobs > 1)
			{
				fsd_calloc(job_desc.array_inx, ARRAY_INX_MAXLEN, char*);
				ret = snprintf(job_desc.array_inx, ARRAY_INX_MAXLEN, "%d-%d:%d", start, end, incr );
				if (ret < 0 || ret >= ARRAY_INX_MAXLEN) {
					fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "snprintf: not enough memory");
				}
				fsd_log_debug(("array job '%s' prepared", job_desc.array_inx));
			}

			/* Submit the batch job */
			if(slurm_submit_batch_job(&job_desc, &submit_response) != SLURM_SUCCESS){
				fsd_exc_raise_fmt(
					FSD_ERRNO_INTERNAL_ERROR,"slurm_submit_batch_job: %s",slurm_strerror(slurm_get_errno()));
			}

			connection_lock = fsd_mutex_unlock( &self->drm_connection_mutex );

			/* Watch each job in the array */
			for (i = 0; i < n_jobs; ++i) {
				job_id = (int) submit_response->job_id;
				task_id = start + i*incr;
				if (n_jobs > 1) {
					/* Array job */
					if (!working_cluster_rec)
						job_ids[i] = fsd_asprintf("%d_%d", job_id, task_id); /* .0*/
					else
						job_ids[i] = fsd_asprintf("%d_%d.%s", job_id, task_id, working_cluster_rec->name);
				} else {
					/* Single job */
					if (!working_cluster_rec)
						job_ids[i] = fsd_asprintf("%d", job_id); /* .0*/
					else
						job_ids[i] = fsd_asprintf("%d.%s", job_id, working_cluster_rec->name);
				}

				fsd_log_debug(("job %s submitted", job_ids[i]));
				job = slurmdrmaa_job_new( fsd_strdup(job_ids[i]) );
				job->session = self;
				job->submit_time = time(NULL);
				self->jobs->add( self->jobs, job );
				job->release( job );
				job = NULL;
			}

			if (working_cluster_rec)
				slurmdb_destroy_cluster_rec(working_cluster_rec);

			working_cluster_rec = NULL;
	 }