/** * do_basil_confirm - confirm an existing BASIL reservation. * This requires the alloc_sid to equal the session ID (getsid()) of the process * executing the aprun/mpirun commands * Returns: SLURM_SUCCESS if ok, READY_JOB_ERROR/FATAL on transient/fatal error. */ extern int do_basil_confirm(struct job_record *job_ptr) { uint8_t confirmed; uint32_t resv_id; uint64_t pagg_id; if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_CONFIRMED, &confirmed) != SLURM_SUCCESS) { error("can not read confirmed for JobId=%u", job_ptr->job_id); } else if (confirmed != 0) { debug2("ALPS reservation for JobId %u previously confirmed", job_ptr->job_id); return SLURM_SUCCESS; } if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { error("can not read resId for JobId=%u", job_ptr->job_id); } else if (resv_id == 0) { /* On Cray XT/XE, a reservation ID of 0 is always invalid. */ error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id); } else if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_PAGG_ID, &pagg_id) != SLURM_SUCCESS) { error("can not read pagg ID for JobId=%u", job_ptr->job_id); } else { int rc; if (pagg_id == 0) { #ifdef HAVE_REAL_CRAY /* This fallback case is for interactive jobs only */ error("JobId %u has no pagg ID, falling back to SID", job_ptr->job_id); #endif pagg_id = job_ptr->alloc_sid; } rc = basil_confirm(resv_id, job_ptr->job_id, pagg_id); if (rc == 0) { debug2("confirmed ALPS resId %u for JobId %u, pagg " "%"PRIu64"", resv_id, job_ptr->job_id, pagg_id); confirmed = 1; _set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_CONFIRMED, &confirmed); return SLURM_SUCCESS; } else if (rc == -BE_NO_RESID) { /* * If ALPS can not find the reservation ID we are trying * to confirm, it may be that the job has already been * canceled, or that the reservation has timed out after * waiting for the confirmation. * It is more likely that this error occurs on a per-job * basis, hence in this case do not drain frontend node. */ error("JobId %u has invalid ALPS resId %u - job " "already canceled?", job_ptr->job_id, resv_id); return SLURM_SUCCESS; } else { error("confirming ALPS resId %u of JobId %u FAILED: %s", resv_id, job_ptr->job_id, basil_strerror(rc)); if (is_transient_error(rc)) return READY_JOB_ERROR; } } return READY_JOB_FATAL; }
/** * do_basil_reserve - create a BASIL reservation. * IN job_ptr - pointer to job which has just been allocated resources * RET 0 or error code, job will abort or be requeued on failure */ extern int do_basil_reserve(struct job_record *job_ptr) { struct nodespec *ns_head = NULL; uint16_t mppwidth = 0, mppdepth, mppnppn; uint32_t mppmem = 0, node_min_mem = 0; uint32_t resv_id; int i, first_bit, last_bit; hostlist_t hl; long rc; char *user, batch_id[16]; if (!job_ptr->job_resrcs || job_ptr->job_resrcs->nhosts == 0) return SLURM_SUCCESS; debug3("job #%u: %u nodes = %s, cpus=%u" , job_ptr->job_id, job_ptr->job_resrcs->nhosts, job_ptr->job_resrcs->nodes, job_ptr->job_resrcs->ncpus ); if (job_ptr->job_resrcs->node_bitmap == NULL) { error("job %u node_bitmap not set", job_ptr->job_id); return SLURM_SUCCESS; } first_bit = bit_ffs(job_ptr->job_resrcs->node_bitmap); last_bit = bit_fls(job_ptr->job_resrcs->node_bitmap); if (first_bit == -1 || last_bit == -1) return SLURM_SUCCESS; /* no nodes allocated */ mppdepth = MAX(1, job_ptr->details->cpus_per_task); mppnppn = job_ptr->details->ntasks_per_node; /* mppmem */ if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { /* Only honour --mem-per-cpu if --ntasks has been given */ if (job_ptr->details->num_tasks) mppmem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU; } else if (job_ptr->details->pn_min_memory) { node_min_mem = job_ptr->details->pn_min_memory; } hl = hostlist_create(""); if (hl == NULL) fatal("hostlist_create: malloc error"); for (i = first_bit; i <= last_bit; i++) { struct node_record *node_ptr = node_record_table_ptr + i; uint32_t basil_node_id; if (!bit_test(job_ptr->job_resrcs->node_bitmap, i)) continue; if (!node_ptr->name || node_ptr->name[0] == '\0') continue; /* bad node */ if (sscanf(node_ptr->name, "nid%05u", &basil_node_id) != 1) fatal("can not read basil_node_id from %s", node_ptr->name); if (ns_add_node(&ns_head, basil_node_id) != 0) { error("can not add node %s (nid%05u)", node_ptr->name, basil_node_id); free_nodespec(ns_head); return SLURM_ERROR; } if (node_min_mem) { uint32_t node_cpus, node_mem; if (slurmctld_conf.fast_schedule) { node_cpus = node_ptr->config_ptr->cpus; node_mem = node_ptr->config_ptr->real_memory; } else { node_cpus = node_ptr->cpus; node_mem = node_ptr->real_memory; } /* * ALPS 'Processing Elements per Node' value (aprun -N), * which in slurm is --ntasks-per-node and 'mppnppn' in * PBS: if --ntasks is specified, default to the number * of cores per node (also the default for 'aprun -N'). */ node_mem /= mppnppn ? mppnppn : node_cpus; mppmem = node_min_mem = MIN(node_mem, node_min_mem); } } /* mppwidth */ for (i = 0; i < job_ptr->job_resrcs->nhosts; i++) { uint16_t node_tasks = job_ptr->job_resrcs->cpus[i] / mppdepth; if (mppnppn && mppnppn < node_tasks) node_tasks = mppnppn; mppwidth += node_tasks; } snprintf(batch_id, sizeof(batch_id), "%u", job_ptr->job_id); user = uid_to_string(job_ptr->user_id); rc = basil_reserve(user, batch_id, mppwidth, mppdepth, mppnppn, mppmem, ns_head); xfree(user); if (rc <= 0) { /* errno value will be resolved by select_g_job_begin() */ errno = is_transient_error(rc) ? EAGAIN : ECONNABORTED; return SLURM_ERROR; } resv_id = rc; if (_set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { /* * This is a fatal error since it means we will not be able to * confirm the reservation; no step will be able to run in it. */ error("job %u: can not set resId %u", job_ptr->job_id, resv_id); basil_release(resv_id); return SLURM_ERROR; } info("ALPS RESERVATION #%u, JobId %u: BASIL -n %d -N %d -d %d -m %d", resv_id, job_ptr->job_id, mppwidth, mppnppn, mppdepth, mppmem); return SLURM_SUCCESS; }
/** * do_basil_reserve - create a BASIL reservation. * IN job_ptr - pointer to job which has just been allocated resources * RET 0 or error code, job will abort or be requeued on failure */ extern int do_basil_reserve(struct job_record *job_ptr) { struct nodespec *ns_head = NULL; uint16_t mppwidth = 0, mppdepth, mppnppn; /* mppmem must be at least 1 for gang scheduling to work so * if you are wondering why gang scheduling isn't working you * should check your slurm.conf for DefMemPerNode */ uint32_t mppmem = 0, node_min_mem = 0; uint32_t resv_id; int i, first_bit, last_bit; long rc; char *user, batch_id[16]; struct basil_accel_param* bap; if (!job_ptr->job_resrcs || job_ptr->job_resrcs->nhosts == 0) return SLURM_SUCCESS; debug3("job #%u: %u nodes = %s, cpus=%u" , job_ptr->job_id, job_ptr->job_resrcs->nhosts, job_ptr->job_resrcs->nodes, job_ptr->job_resrcs->ncpus ); if (job_ptr->job_resrcs->node_bitmap == NULL) { error("job %u node_bitmap not set", job_ptr->job_id); return SLURM_SUCCESS; } first_bit = bit_ffs(job_ptr->job_resrcs->node_bitmap); last_bit = bit_fls(job_ptr->job_resrcs->node_bitmap); if (first_bit == -1 || last_bit == -1) return SLURM_SUCCESS; /* no nodes allocated */ mppdepth = MAX(1, job_ptr->details->cpus_per_task); mppnppn = job_ptr->details->ntasks_per_node; /* mppmem */ if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { /* Only honour --mem-per-cpu if --ntasks has been given */ if (job_ptr->details->num_tasks) mppmem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU; } else if (job_ptr->details->pn_min_memory) { node_min_mem = job_ptr->details->pn_min_memory; } for (i = first_bit; i <= last_bit; i++) { struct node_record *node_ptr = node_record_table_ptr + i; uint32_t basil_node_id; if (!bit_test(job_ptr->job_resrcs->node_bitmap, i)) continue; if (!node_ptr->name || node_ptr->name[0] == '\0') continue; /* bad node */ if (sscanf(node_ptr->name, "nid%05u", &basil_node_id) != 1) fatal("can not read basil_node_id from %s", node_ptr->name); if (ns_add_node(&ns_head, basil_node_id, false) != 0) { error("can not add node %s (nid%05u)", node_ptr->name, basil_node_id); free_nodespec(ns_head); return SLURM_ERROR; } if (node_min_mem) { uint32_t node_cpus, node_mem; int32_t tmp_mppmem; if (slurmctld_conf.fast_schedule) { node_cpus = node_ptr->config_ptr->cpus; node_mem = node_ptr->config_ptr->real_memory; } else { node_cpus = node_ptr->cpus; node_mem = node_ptr->real_memory; } /* * ALPS 'Processing Elements per Node' value (aprun -N), * which in slurm is --ntasks-per-node and 'mppnppn' in * PBS: if --ntasks is specified, default to the number * of cores per node (also the default for 'aprun -N'). * On a heterogeneous system the nodes aren't * always the same so keep track of the lowest * mppmem and use it as the level for all * nodes (mppmem is 0 when coming in). */ node_mem /= mppnppn ? mppnppn : node_cpus; tmp_mppmem = node_min_mem = MIN(node_mem, node_min_mem); /* If less than or equal to 0 make sure you have 1 at least since 0 means give all the memory to the job. */ if (tmp_mppmem <= 0) tmp_mppmem = 1; if (mppmem) mppmem = MIN(mppmem, tmp_mppmem); else mppmem = tmp_mppmem; } } /* mppwidth */ for (i = 0; i < job_ptr->job_resrcs->nhosts; i++) { uint16_t node_tasks = job_ptr->job_resrcs->cpus[i] / mppdepth; if (mppnppn && mppnppn < node_tasks) node_tasks = mppnppn; mppwidth += node_tasks; } snprintf(batch_id, sizeof(batch_id), "%u", job_ptr->job_id); user = uid_to_string(job_ptr->user_id); if (job_ptr->gres_list) bap = build_accel_param(job_ptr); else bap = NULL; rc = basil_reserve(user, batch_id, mppwidth, mppdepth, mppnppn, mppmem, ns_head, bap); xfree(user); if (rc <= 0) { /* errno value will be resolved by select_g_job_begin() */ errno = is_transient_error(rc) ? EAGAIN : ECONNABORTED; return SLURM_ERROR; } resv_id = rc; if (_set_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { /* * This is a fatal error since it means we will not be able to * confirm the reservation; no step will be able to run in it. */ error("job %u: can not set resId %u", job_ptr->job_id, resv_id); basil_release(resv_id); return SLURM_ERROR; } if (mppmem) job_ptr->details->pn_min_memory = mppmem | MEM_PER_CPU; info("ALPS RESERVATION #%u, JobId %u: BASIL -n %d -N %d -d %d -m %d", resv_id, job_ptr->job_id, mppwidth, mppnppn, mppdepth, mppmem); return SLURM_SUCCESS; }