/*
 * slurm_update_job2 - issue RPC to update a job's configuration per request,
 *	only usable by user root or (for some parameters) the job's owner
 * IN job_msg - description of job updates
 * OUT resp - per task response to the request, only written when the
 *	controller answers with RESPONSE_JOB_ARRAY_ERRORS,
 *	free using slurm_free_job_array_resp()
 * RET SLURM_SUCCESS on success. On failure a non-zero value is returned:
 *	- the value returned by slurm_send_recv_controller_msg() when the
 *	  exchange itself fails (errno is left exactly as that call left it),
 *	- the controller's return code for a RESPONSE_SLURM_RC reply
 *	  (errno is set to the same code),
 *	- SLURM_ERROR with errno = SLURM_UNEXPECTED_MSG_ERROR for any other
 *	  reply type.
 */
extern int slurm_update_job2(job_desc_msg_t *job_msg,
			     job_array_resp_msg_t **resp)
{
	int rc = SLURM_SUCCESS;
	slurm_msg_t req_msg, resp_msg;
	/* Remember the caller's cluster record so it can be restored. */
	slurmdb_cluster_rec_t *save_working_cluster_rec = working_cluster_rec;

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_UPDATE_JOB;
	req_msg.data = job_msg;

tryagain:
	slurm_msg_t_init(&resp_msg);

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);
	if (rc < 0) {
		/*
		 * No reply was received, so resp_msg must not be interpreted.
		 * Skipping the switch also keeps the default branch from
		 * replacing the real failure reason in errno with
		 * SLURM_UNEXPECTED_MSG_ERROR.
		 */
		goto cleanup;
	}

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_REROUTE_MSG:
	{
		reroute_msg_t *rr_msg = (reroute_msg_t *) resp_msg.data;

		/*
		 * Don't expect multiple hops but in the case it does
		 * happen, free the previous rr cluster_rec.
		 */
		if (working_cluster_rec &&
		    (working_cluster_rec != save_working_cluster_rec))
			slurmdb_destroy_cluster_rec(working_cluster_rec);

		/* Take the record out of the message and resend to it. */
		working_cluster_rec = rr_msg->working_cluster_rec;
		slurmdb_setup_cluster_rec(working_cluster_rec);
		rr_msg->working_cluster_rec = NULL;
		/*
		 * NOTE(review): rr_msg itself is not released here -
		 * confirm whether it should be freed before retrying.
		 */
		goto tryagain;
	}
	case RESPONSE_JOB_ARRAY_ERRORS:
		/* Ownership of the payload passes to the caller. */
		*resp = (job_array_resp_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		if (rc)
			slurm_seterrno(rc);
		/*
		 * NOTE(review): resp_msg.data is not released on this
		 * path - confirm whether it should be freed here.
		 */
		break;
	default:
		/* Report the failure through the return value as well. */
		slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR);
		rc = SLURM_ERROR;
		break;
	}

cleanup:
	/* Drop any record adopted from a reroute and restore the original. */
	if (working_cluster_rec != save_working_cluster_rec) {
		slurmdb_destroy_cluster_rec(working_cluster_rec);
		working_cluster_rec = save_working_cluster_rec;
	}

	return rc;
}
/*
 * slurm_setup_remote_working_cluster - install the cluster record carried by
 *	an allocation response as the global working_cluster_rec
 * IN/OUT msg - allocation response; its working_cluster_rec, node_list and
 *	node_addr must be set. The cluster record is moved out of the
 *	message (msg->working_cluster_rec is NULL on return).
 */
extern void slurm_setup_remote_working_cluster(
	resource_allocation_response_msg_t *msg)
{
	slurmdb_cluster_rec_t *remote = NULL;

	xassert(msg);
	xassert(msg->working_cluster_rec);
	xassert(msg->node_list);
	xassert(msg->node_addr);

	/* Release the record that was installed before, if any. */
	if (working_cluster_rec != NULL) {
		slurmdb_destroy_cluster_rec(working_cluster_rec);
	}

	/* Move the record from the message into the global. */
	remote = (slurmdb_cluster_rec_t *) msg->working_cluster_rec;
	msg->working_cluster_rec = NULL;
	working_cluster_rec = remote;

	/* Replace the stored select plugin id by its looked-up position. */
	remote->plugin_id_select =
		select_get_plugin_id_pos(remote->plugin_id_select);

	/* Fill in the controller address from the record's port and host. */
	slurm_set_addr(&remote->control_addr,
		       remote->control_port,
		       remote->control_host);

	/* Export the cluster name; a failure is logged but not fatal. */
	if (setenvf(NULL, "SLURM_CLUSTER_NAME", "%s", remote->name) < 0) {
		error("unable to set SLURM_CLUSTER_NAME in environment");
	}

	add_remote_nodes_to_conf_tbls(msg->node_list, msg->node_addr);
}
extern int sacctmgr_modify_cluster(int argc, char **argv) { int rc = SLURM_SUCCESS; int i=0; slurmdb_cluster_rec_t *cluster = xmalloc(sizeof(slurmdb_cluster_rec_t)); slurmdb_assoc_rec_t *assoc = xmalloc(sizeof(slurmdb_assoc_rec_t)); slurmdb_assoc_cond_t *assoc_cond = xmalloc(sizeof(slurmdb_assoc_cond_t)); int cond_set = 0, prev_set = 0, rec_set = 0, set = 0; List ret_list = NULL; slurmdb_cluster_cond_t cluster_cond; bool existing_fed = false; slurmdb_init_assoc_rec(assoc, 0); assoc_cond->cluster_list = list_create(slurm_destroy_char); assoc_cond->acct_list = list_create(NULL); slurmdb_init_cluster_rec(cluster, 0); slurmdb_init_cluster_cond(&cluster_cond, 0); cluster_cond.cluster_list = assoc_cond->cluster_list; for (i=0; i<argc; i++) { int command_len = strlen(argv[i]); if (!strncasecmp(argv[i], "Where", MAX(command_len, 5))) { i++; prev_set = _set_cond(&i, argc, argv, &cluster_cond, NULL); cond_set |= prev_set; } else if (!strncasecmp(argv[i], "Set", MAX(command_len, 3))) { i++; prev_set = _set_rec(&i, argc, argv, NULL, assoc, cluster); rec_set |= prev_set; } else { prev_set = _set_cond(&i, argc, argv, &cluster_cond, NULL); cond_set |= prev_set; } } if (exit_code) { rc = SLURM_ERROR; goto end_it; } else if (!rec_set) { exit_code=1; fprintf(stderr, " You didn't give me anything to set\n"); rc = SLURM_ERROR; goto end_it; } else if (!cond_set) { if (!commit_check("You didn't set any conditions with 'WHERE'.\n" "Are you sure you want to continue?")) { printf("Aborted\n"); rc = SLURM_SUCCESS; goto end_it; } } if (cluster->fed.name && cluster->fed.name[0]) { int rc; /* Make sure federation exists. */ List fed_list = list_create(slurm_destroy_char); list_append(fed_list, xstrdup(cluster->fed.name)); rc = verify_federations_exist(fed_list); FREE_NULL_LIST(fed_list); if (rc) goto end_it; /* See if cluster is assigned to another federation already. 
*/ if (list_count(cluster_cond.cluster_list)) { if (_verify_fed_clusters(cluster_cond.cluster_list, cluster->fed.name, &existing_fed)) goto end_it; else if (!list_count(cluster_cond.cluster_list)) { /* irrelevant changes have been removed and * nothing to change now. */ printf("Nothing to change\n"); rc = SLURM_ERROR; (void)rc; /* CLANG false positive */ goto end_it; } else if (existing_fed) { char *warning = "\nAre you sure you want to continue?"; if (!commit_check(warning)) { rc = SLURM_ERROR; (void)rc; /* CLANG false positive */ goto end_it; } } } } if (cond_set & 1) { List temp_list = NULL; temp_list = acct_storage_g_get_clusters(db_conn, my_uid, &cluster_cond); if (!temp_list) { exit_code=1; fprintf(stderr, " Problem getting clusters from database. " "Contact your admin.\n"); rc = SLURM_ERROR; goto end_it; } else if (!list_count(temp_list)) { fprintf(stderr, " Query didn't return any clusters.\n"); rc = SLURM_ERROR; goto end_it; } /* we are only looking for the clusters returned from this query, so we free the cluster_list and replace it */ FREE_NULL_LIST(assoc_cond->cluster_list); assoc_cond->cluster_list = temp_list; } printf(" Setting\n"); if (rec_set & CLUS_REC_SET) sacctmgr_print_cluster(cluster); if (rec_set & CLUS_ASSOC_SET) { printf(" Default Limits:\n"); sacctmgr_print_assoc_limits(assoc); } if (rec_set & CLUS_REC_SET) { notice_thread_init(); ret_list = acct_storage_g_modify_clusters( db_conn, my_uid, &cluster_cond, cluster); if (ret_list && list_count(ret_list)) { char *object = NULL; ListIterator itr = list_iterator_create(ret_list); printf(" Modified cluster...\n"); while((object = list_next(itr))) { printf(" %s\n", object); } list_iterator_destroy(itr); set = 1; } else if (ret_list) { printf(" Nothing modified\n"); rc = SLURM_ERROR; } else { exit_code=1; fprintf(stderr, " Error with request: %s\n", slurm_strerror(errno)); rc = SLURM_ERROR; } FREE_NULL_LIST(ret_list); notice_thread_fini(); } if (rec_set & CLUS_ASSOC_SET) { 
list_append(assoc_cond->acct_list, "root"); notice_thread_init(); ret_list = acct_storage_g_modify_assocs(db_conn, my_uid, assoc_cond, assoc); if (ret_list && list_count(ret_list)) { char *object = NULL; ListIterator itr = list_iterator_create(ret_list); printf(" Modified cluster defaults for " "associations...\n"); while((object = list_next(itr))) { printf(" %s\n", object); } list_iterator_destroy(itr); set = 1; } else if (ret_list) { printf(" Nothing modified\n"); rc = SLURM_ERROR; } else { exit_code=1; fprintf(stderr, " Error with request: %s\n", slurm_strerror(errno)); rc = SLURM_ERROR; } FREE_NULL_LIST(ret_list); notice_thread_fini(); } if (set) { if (commit_check("Would you like to commit changes?")) acct_storage_g_commit(db_conn, 1); else { printf(" Changes Discarded\n"); acct_storage_g_commit(db_conn, 0); } } end_it: slurmdb_destroy_assoc_cond(assoc_cond); slurmdb_destroy_assoc_rec(assoc); slurmdb_destroy_cluster_rec(cluster); return rc; }
extern int sacctmgr_add_cluster(int argc, char **argv) { int rc = SLURM_SUCCESS; int i = 0; slurmdb_cluster_rec_t *cluster = NULL; slurmdb_cluster_rec_t *start_cluster = xmalloc(sizeof(slurmdb_cluster_rec_t)); List name_list = list_create(slurm_destroy_char); List cluster_list = NULL; slurmdb_assoc_rec_t start_assoc; int limit_set = 0; ListIterator itr = NULL, itr_c = NULL; char *name = NULL; slurmdb_init_assoc_rec(&start_assoc, 0); slurmdb_init_cluster_rec(start_cluster, 0); for (i=0; i<argc; i++) { int command_len = strlen(argv[i]); if (!strncasecmp(argv[i], "Where", MAX(command_len, 5)) || !strncasecmp(argv[i], "Set", MAX(command_len, 3))) i++; limit_set += _set_rec(&i, argc, argv, name_list, &start_assoc, start_cluster); } if (exit_code) { FREE_NULL_LIST(name_list); slurmdb_destroy_cluster_rec(start_cluster); return SLURM_ERROR; } else if (!list_count(name_list)) { FREE_NULL_LIST(name_list); slurmdb_destroy_cluster_rec(start_cluster); exit_code=1; fprintf(stderr, " Need name of cluster to add.\n"); return SLURM_ERROR; } else { List temp_list = NULL; slurmdb_cluster_cond_t cluster_cond; slurmdb_init_cluster_cond(&cluster_cond, 0); cluster_cond.cluster_list = name_list; cluster_cond.classification = start_cluster->classification; temp_list = acct_storage_g_get_clusters(db_conn, my_uid, &cluster_cond); if (!temp_list) { exit_code=1; fprintf(stderr, " Problem getting clusters from database. " "Contact your admin.\n"); slurmdb_destroy_cluster_rec(start_cluster); return SLURM_ERROR; } itr_c = list_iterator_create(name_list); itr = list_iterator_create(temp_list); while((name = list_next(itr_c))) { slurmdb_cluster_rec_t *cluster_rec = NULL; list_iterator_reset(itr); while((cluster_rec = list_next(itr))) { if (!xstrcasecmp(cluster_rec->name, name)) break; } if (cluster_rec) { printf(" This cluster %s already exists. 
" "Not adding.\n", name); list_delete_item(itr_c); } } list_iterator_destroy(itr); list_iterator_destroy(itr_c); FREE_NULL_LIST(temp_list); if (!list_count(name_list)) { FREE_NULL_LIST(name_list); slurmdb_destroy_cluster_rec(start_cluster); return SLURM_ERROR; } } if (start_cluster->fed.name) { int rc; List fed_list = list_create(slurm_destroy_char); list_append(fed_list, xstrdup(start_cluster->fed.name)); rc = verify_federations_exist(fed_list); FREE_NULL_LIST(fed_list); if (rc) { slurmdb_destroy_cluster_rec(start_cluster); FREE_NULL_LIST(name_list); return SLURM_ERROR; } } printf(" Adding Cluster(s)\n"); cluster_list = list_create(slurmdb_destroy_cluster_rec); itr = list_iterator_create(name_list); while((name = list_next(itr))) { if (!name[0]) { exit_code=1; fprintf(stderr, " No blank names are " "allowed when adding.\n"); rc = SLURM_ERROR; continue; } cluster = xmalloc(sizeof(slurmdb_cluster_rec_t)); slurmdb_init_cluster_rec(cluster, 0); list_append(cluster_list, cluster); slurmdb_copy_cluster_rec(cluster, start_cluster); cluster->name = xstrdup(name); printf(" Name = %s\n", cluster->name); cluster->root_assoc = xmalloc(sizeof(slurmdb_assoc_rec_t)); slurmdb_init_assoc_rec(cluster->root_assoc, 0); cluster->root_assoc->def_qos_id = start_assoc.def_qos_id; cluster->root_assoc->shares_raw = start_assoc.shares_raw; slurmdb_copy_assoc_rec_limits( cluster->root_assoc, &start_assoc); } list_iterator_destroy(itr); FREE_NULL_LIST(name_list); if (limit_set) printf(" Setting\n"); if (limit_set & CLUS_REC_SET) sacctmgr_print_cluster(start_cluster); if (limit_set & CLUS_ASSOC_SET) { printf(" Default Limits:\n"); sacctmgr_print_assoc_limits(&start_assoc); FREE_NULL_LIST(start_assoc.qos_list); } slurmdb_destroy_cluster_rec(start_cluster); if (!list_count(cluster_list)) { printf(" Nothing new added.\n"); rc = SLURM_ERROR; goto end_it; } /* Since we are creating tables with add cluster that can't be rolled back. 
So we ask before hand if they are serious about it so we can rollback if needed. */ if (commit_check("Would you like to commit changes?")) { notice_thread_init(); rc = acct_storage_g_add_clusters(db_conn, my_uid, cluster_list); notice_thread_fini(); if (rc == SLURM_SUCCESS) { acct_storage_g_commit(db_conn, 1); } else { exit_code=1; fprintf(stderr, " Problem adding clusters: %s\n", slurm_strerror(rc)); /* this isn't really needed, but just to be safe */ acct_storage_g_commit(db_conn, 0); } } else { printf(" Changes Discarded\n"); /* this isn't really needed, but just to be safe */ acct_storage_g_commit(db_conn, 0); } end_it: FREE_NULL_LIST(cluster_list); return rc; }
fsd_iter_t * slurmdrmaa_session_run_bulk( fsd_drmaa_session_t *self, const fsd_template_t *jt, int start, int end, int incr ) { int ret = 0; unsigned i = 0; int job_id = 0; int task_id = 0; fsd_job_t *volatile job = NULL; volatile unsigned n_jobs = (end - start) / incr + 1; char ** volatile job_ids = fsd_calloc( job_ids, n_jobs + 1, char* ); volatile bool connection_lock = false; fsd_environ_t *volatile env = NULL; job_desc_msg_t job_desc; submit_response_msg_t *submit_response = NULL; slurmdrmaa_init_job_desc( &job_desc ); TRY { connection_lock = fsd_mutex_lock( &self->drm_connection_mutex ); slurmdrmaa_job_create_req( self, jt, (fsd_environ_t**)&env , &job_desc, 0 ); /* Create job array spec if more than 1 task */ if(n_jobs > 1) { fsd_calloc(job_desc.array_inx, ARRAY_INX_MAXLEN, char*); ret = snprintf(job_desc.array_inx, ARRAY_INX_MAXLEN, "%d-%d:%d", start, end, incr ); if (ret < 0 || ret >= ARRAY_INX_MAXLEN) { fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR, "snprintf: not enough memory"); } fsd_log_debug(("array job '%s' prepared", job_desc.array_inx)); } /* Submit the batch job */ if(slurm_submit_batch_job(&job_desc, &submit_response) != SLURM_SUCCESS){ fsd_exc_raise_fmt( FSD_ERRNO_INTERNAL_ERROR,"slurm_submit_batch_job: %s",slurm_strerror(slurm_get_errno())); } connection_lock = fsd_mutex_unlock( &self->drm_connection_mutex ); /* Watch each job in the array */ for (i = 0; i < n_jobs; ++i) { job_id = (int) submit_response->job_id; task_id = start + i*incr; if (n_jobs > 1) { /* Array job */ if (!working_cluster_rec) job_ids[i] = fsd_asprintf("%d_%d", job_id, task_id); /* .0*/ else job_ids[i] = fsd_asprintf("%d_%d.%s", job_id, task_id, working_cluster_rec->name); } else { /* Single job */ if (!working_cluster_rec) job_ids[i] = fsd_asprintf("%d", job_id); /* .0*/ else job_ids[i] = fsd_asprintf("%d.%s", job_id, working_cluster_rec->name); } fsd_log_debug(("job %s submitted", job_ids[i])); job = slurmdrmaa_job_new( fsd_strdup(job_ids[i]) ); job->session = self; 
job->submit_time = time(NULL); self->jobs->add( self->jobs, job ); job->release( job ); job = NULL; } if (working_cluster_rec) slurmdb_destroy_cluster_rec(working_cluster_rec); working_cluster_rec = NULL; }