/** * basil_reserve - wrapper around rsvn_new. * @user: owner of the reservation * @batch_id: (numeric) job ID * @width: mppwidth (aprun -n) * @depth: mppdepth (aprun -d) * @nppn: mppnppn (aprun -N) * @mem_mb: mppmem (aprun -m) * @ns_head: list of requested mppnodes (will be freed if not NULL) * @accel_head: optional accelerator parameters * Returns reservation ID > 0 if ok, negative %enum basil_error on error. */ long basil_reserve(const char *user, const char *batch_id, uint32_t width, uint32_t depth, uint32_t nppn, uint32_t mem_mb, uint32_t nppcu, struct nodespec *ns_head, struct basil_accel_param *accel_head) { struct basil_reservation *rsvn; struct basil_parse_data bp = {0}; /* do not free mppnodes it is stored/freed in the rsvn struct */ char *mppnodes = ns_to_string(ns_head); long rc; free_nodespec(ns_head); rsvn = _rsvn_new(user, batch_id, width, depth, nppn, mem_mb, nppcu, mppnodes, accel_head); if (rsvn == NULL) return -BE_INTERNAL; bp.method = BM_reserve; bp.mdata.res = rsvn; bp.version = BV_1_0; /* * Rule: * - if *res->batch_id is set, we are using Basil 1.1 * - if *res->batch_id == '\0' we have to fall back to Basil 1.0 */ if (batch_id && *batch_id) bp.version = get_basil_version(); rc = basil_request(&bp); if (rc >= 0) rc = rsvn->rsvn_id; free_rsvn(rsvn); return rc; }
/*
 *	Node-specifier lists
 */

/*
 * free_nodespec - release every element of a node-specifier list.
 * @ns: head of the list, linked through ->next; NULL is accepted.
 */
void free_nodespec(struct nodespec *ns)
{
	while (ns != NULL) {
		/* Grab the successor before the current element goes away. */
		struct nodespec *following = ns->next;

		free(ns);
		ns = following;
	}
}
/*
 * free_rsvn - release a reservation object.
 * @r: reservation to release; NULL is accepted and ignored.
 *
 * Frees the parameter data (r->params), the reserved-node list
 * (r->rsvd_nodes) and finally the reservation structure itself.
 */
void free_rsvn(struct basil_reservation *r)
{
	if (r == NULL)
		return;

	rsvn_free_param(r->params);
	free_nodespec(r->rsvd_nodes);
	free(r);
}
/** * do_basil_reserve - create a BASIL reservation. * IN job_ptr - pointer to job which has just been allocated resources * RET 0 or error code, job will abort or be requeued on failure */ extern int do_basil_reserve(struct job_record *job_ptr) { struct nodespec *ns_head = NULL; uint16_t mppwidth = 0, mppdepth, mppnppn; uint32_t mppmem = 0, node_min_mem = 0; uint32_t resv_id; int i, first_bit, last_bit; hostlist_t hl; long rc; char *user, batch_id[16]; if (!job_ptr->job_resrcs || job_ptr->job_resrcs->nhosts == 0) return SLURM_SUCCESS; debug3("job #%u: %u nodes = %s, cpus=%u" , job_ptr->job_id, job_ptr->job_resrcs->nhosts, job_ptr->job_resrcs->nodes, job_ptr->job_resrcs->ncpus ); if (job_ptr->job_resrcs->node_bitmap == NULL) { error("job %u node_bitmap not set", job_ptr->job_id); return SLURM_SUCCESS; } first_bit = bit_ffs(job_ptr->job_resrcs->node_bitmap); last_bit = bit_fls(job_ptr->job_resrcs->node_bitmap); if (first_bit == -1 || last_bit == -1) return SLURM_SUCCESS; /* no nodes allocated */ mppdepth = MAX(1, job_ptr->details->cpus_per_task); mppnppn = job_ptr->details->ntasks_per_node; /* mppmem */ if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { /* Only honour --mem-per-cpu if --ntasks has been given */ if (job_ptr->details->num_tasks) mppmem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU; } else if (job_ptr->details->pn_min_memory) { node_min_mem = job_ptr->details->pn_min_memory; } hl = hostlist_create(""); if (hl == NULL) fatal("hostlist_create: malloc error"); for (i = first_bit; i <= last_bit; i++) { struct node_record *node_ptr = node_record_table_ptr + i; uint32_t basil_node_id; if (!bit_test(job_ptr->job_resrcs->node_bitmap, i)) continue; if (!node_ptr->name || node_ptr->name[0] == '\0') continue; /* bad node */ if (sscanf(node_ptr->name, "nid%05u", &basil_node_id) != 1) fatal("can not read basil_node_id from %s", node_ptr->name); if (ns_add_node(&ns_head, basil_node_id) != 0) { error("can not add node %s (nid%05u)", node_ptr->name, 
basil_node_id); free_nodespec(ns_head); return SLURM_ERROR; } if (node_min_mem) { uint32_t node_cpus, node_mem; if (slurmctld_conf.fast_schedule) { node_cpus = node_ptr->config_ptr->cpus; node_mem = node_ptr->config_ptr->real_memory; } else { node_cpus = node_ptr->cpus; node_mem = node_ptr->real_memory; } /* * ALPS 'Processing Elements per Node' value (aprun -N), * which in slurm is --ntasks-per-node and 'mppnppn' in * PBS: if --ntasks is specified, default to the number * of cores per node (also the default for 'aprun -N'). */ node_mem /= mppnppn ? mppnppn : node_cpus; mppmem = node_min_mem = MIN(node_mem, node_min_mem); } } /* mppwidth */ for (i = 0; i < job_ptr->job_resrcs->nhosts; i++) { uint16_t node_tasks = job_ptr->job_resrcs->cpus[i] / mppdepth; if (mppnppn && mppnppn < node_tasks) node_tasks = mppnppn; mppwidth += node_tasks; } snprintf(batch_id, sizeof(batch_id), "%u", job_ptr->job_id); user = uid_to_string(job_ptr->user_id); rc = basil_reserve(user, batch_id, mppwidth, mppdepth, mppnppn, mppmem, ns_head); xfree(user); if (rc <= 0) { /* errno value will be resolved by select_g_job_begin() */ errno = is_transient_error(rc) ? EAGAIN : ECONNABORTED; return SLURM_ERROR; } resv_id = rc; if (_set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { /* * This is a fatal error since it means we will not be able to * confirm the reservation; no step will be able to run in it. */ error("job %u: can not set resId %u", job_ptr->job_id, resv_id); basil_release(resv_id); return SLURM_ERROR; } info("ALPS RESERVATION #%u, JobId %u: BASIL -n %d -N %d -d %d -m %d", resv_id, job_ptr->job_id, mppwidth, mppnppn, mppdepth, mppmem); return SLURM_SUCCESS; }
/** * do_basil_reserve - create a BASIL reservation. * IN job_ptr - pointer to job which has just been allocated resources * RET 0 or error code, job will abort or be requeued on failure */ extern int do_basil_reserve(struct job_record *job_ptr) { struct nodespec *ns_head = NULL; uint16_t mppwidth = 0, mppdepth, mppnppn; /* mppmem must be at least 1 for gang scheduling to work so * if you are wondering why gang scheduling isn't working you * should check your slurm.conf for DefMemPerNode */ uint32_t mppmem = 0, node_min_mem = 0; uint32_t resv_id; int i, first_bit, last_bit; long rc; char *user, batch_id[16]; struct basil_accel_param* bap; if (!job_ptr->job_resrcs || job_ptr->job_resrcs->nhosts == 0) return SLURM_SUCCESS; debug3("job #%u: %u nodes = %s, cpus=%u" , job_ptr->job_id, job_ptr->job_resrcs->nhosts, job_ptr->job_resrcs->nodes, job_ptr->job_resrcs->ncpus ); if (job_ptr->job_resrcs->node_bitmap == NULL) { error("job %u node_bitmap not set", job_ptr->job_id); return SLURM_SUCCESS; } first_bit = bit_ffs(job_ptr->job_resrcs->node_bitmap); last_bit = bit_fls(job_ptr->job_resrcs->node_bitmap); if (first_bit == -1 || last_bit == -1) return SLURM_SUCCESS; /* no nodes allocated */ mppdepth = MAX(1, job_ptr->details->cpus_per_task); mppnppn = job_ptr->details->ntasks_per_node; /* mppmem */ if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { /* Only honour --mem-per-cpu if --ntasks has been given */ if (job_ptr->details->num_tasks) mppmem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU; } else if (job_ptr->details->pn_min_memory) { node_min_mem = job_ptr->details->pn_min_memory; } for (i = first_bit; i <= last_bit; i++) { struct node_record *node_ptr = node_record_table_ptr + i; uint32_t basil_node_id; if (!bit_test(job_ptr->job_resrcs->node_bitmap, i)) continue; if (!node_ptr->name || node_ptr->name[0] == '\0') continue; /* bad node */ if (sscanf(node_ptr->name, "nid%05u", &basil_node_id) != 1) fatal("can not read basil_node_id from %s", node_ptr->name); if 
(ns_add_node(&ns_head, basil_node_id, false) != 0) { error("can not add node %s (nid%05u)", node_ptr->name, basil_node_id); free_nodespec(ns_head); return SLURM_ERROR; } if (node_min_mem) { uint32_t node_cpus, node_mem; int32_t tmp_mppmem; if (slurmctld_conf.fast_schedule) { node_cpus = node_ptr->config_ptr->cpus; node_mem = node_ptr->config_ptr->real_memory; } else { node_cpus = node_ptr->cpus; node_mem = node_ptr->real_memory; } /* * ALPS 'Processing Elements per Node' value (aprun -N), * which in slurm is --ntasks-per-node and 'mppnppn' in * PBS: if --ntasks is specified, default to the number * of cores per node (also the default for 'aprun -N'). * On a heterogeneous system the nodes aren't * always the same so keep track of the lowest * mppmem and use it as the level for all * nodes (mppmem is 0 when coming in). */ node_mem /= mppnppn ? mppnppn : node_cpus; tmp_mppmem = node_min_mem = MIN(node_mem, node_min_mem); /* If less than or equal to 0 make sure you have 1 at least since 0 means give all the memory to the job. */ if (tmp_mppmem <= 0) tmp_mppmem = 1; if (mppmem) mppmem = MIN(mppmem, tmp_mppmem); else mppmem = tmp_mppmem; } } /* mppwidth */ for (i = 0; i < job_ptr->job_resrcs->nhosts; i++) { uint16_t node_tasks = job_ptr->job_resrcs->cpus[i] / mppdepth; if (mppnppn && mppnppn < node_tasks) node_tasks = mppnppn; mppwidth += node_tasks; } snprintf(batch_id, sizeof(batch_id), "%u", job_ptr->job_id); user = uid_to_string(job_ptr->user_id); if (job_ptr->gres_list) bap = build_accel_param(job_ptr); else bap = NULL; rc = basil_reserve(user, batch_id, mppwidth, mppdepth, mppnppn, mppmem, ns_head, bap); xfree(user); if (rc <= 0) { /* errno value will be resolved by select_g_job_begin() */ errno = is_transient_error(rc) ? 
EAGAIN : ECONNABORTED; return SLURM_ERROR; } resv_id = rc; if (_set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { /* * This is a fatal error since it means we will not be able to * confirm the reservation; no step will be able to run in it. */ error("job %u: can not set resId %u", job_ptr->job_id, resv_id); basil_release(resv_id); return SLURM_ERROR; } if (mppmem) job_ptr->details->pn_min_memory = mppmem | MEM_PER_CPU; info("ALPS RESERVATION #%u, JobId %u: BASIL -n %d -N %d -d %d -m %d", resv_id, job_ptr->job_id, mppwidth, mppnppn, mppdepth, mppmem); return SLURM_SUCCESS; }