/*
 * slurm_load_jobs - issue RPC to get all job configuration
 *	information if changed since update_time
 * IN update_time - time of current configuration data
 * IN/OUT job_info_msg_pptr - place to store a job configuration pointer
 * IN show_flags - job filtering option: 0, SHOW_ALL or SHOW_DETAIL
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_job_info_msg
 */
extern int slurm_load_jobs (time_t update_time,
                            job_info_msg_t **job_info_msg_pptr,
                            uint16_t show_flags)
{
        int rc;
        slurm_msg_t resp_msg;
        slurm_msg_t req_msg;
        job_info_request_msg_t req;

        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);

        req.last_update  = update_time;
        req.show_flags   = show_flags;
        req_msg.msg_type = REQUEST_JOB_INFO;
        req_msg.data     = &req;

        if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
                return SLURM_ERROR;

        switch (resp_msg.msg_type) {
        case RESPONSE_JOB_INFO:
                *job_info_msg_pptr = (job_info_msg_t *) resp_msg.data;
                break;
        case RESPONSE_SLURM_RC:
                rc = ((return_code_msg_t *) resp_msg.data)->return_code;
                slurm_free_return_code_msg(resp_msg.data);
                if (rc)
                        slurm_seterrno_ret(rc);
                break;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_PROTOCOL_SUCCESS;
}
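/*
 * Illustrative sketch (not part of the API above): one way a client might
 * drive slurm_load_jobs() and walk the result. Assumes <stdio.h> and the
 * public <slurm/slurm.h> declarations; record_count and job_array are the
 * documented job_info_msg_t members.
 */
static void _example_print_jobs(void)
{
        job_info_msg_t *job_info = NULL;
        uint32_t i;

        /* an update_time of 0 requests a full dump rather than a delta */
        if (slurm_load_jobs((time_t) 0, &job_info, SHOW_ALL) != 0) {
                slurm_perror("slurm_load_jobs");
                return;
        }
        for (i = 0; i < job_info->record_count; i++)
                printf("job %u in state %u\n",
                       job_info->job_array[i].job_id,
                       job_info->job_array[i].job_state);
        slurm_free_job_info_msg(job_info);      /* per the NOTE above */
}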
/*
 * slurm_load_job - issue RPC to get job information for one job ID
 * OUT resp - place to store a job configuration pointer
 * IN job_id - ID of job we want information about
 * IN show_flags - job filtering option: 0, SHOW_ALL or SHOW_DETAIL
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_job_info_msg
 */
extern int slurm_load_job (job_info_msg_t **resp, uint32_t job_id,
                           uint16_t show_flags)
{
        int rc;
        slurm_msg_t resp_msg;
        slurm_msg_t req_msg;
        job_id_msg_t req;

        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);

        memset(&req, 0, sizeof(job_id_msg_t));  /* bzero() is deprecated */
        req.job_id       = job_id;
        req.show_flags   = show_flags;
        req_msg.msg_type = REQUEST_JOB_INFO_SINGLE;
        req_msg.data     = &req;

        if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
                return SLURM_ERROR;

        switch (resp_msg.msg_type) {
        case RESPONSE_JOB_INFO:
                *resp = (job_info_msg_t *) resp_msg.data;
                break;
        case RESPONSE_SLURM_RC:
                rc = ((return_code_msg_t *) resp_msg.data)->return_code;
                slurm_free_return_code_msg(resp_msg.data);
                if (rc)
                        slurm_seterrno_ret(rc);
                break;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_load_node_single - issue RPC to get slurm configuration information
 *	for a specific node
 * OUT resp - place to store a node configuration pointer
 * IN node_name - name of the node for which information is requested
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node_single (node_info_msg_t **resp,
                                   char *node_name, uint16_t show_flags)
{
        int rc;
        slurm_msg_t req_msg;
        slurm_msg_t resp_msg;
        node_info_single_msg_t req;

        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);

        req.node_name    = node_name;
        req.show_flags   = show_flags;
        req_msg.msg_type = REQUEST_NODE_INFO_SINGLE;
        req_msg.data     = &req;

        if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
                return SLURM_ERROR;

        switch (resp_msg.msg_type) {
        case RESPONSE_NODE_INFO:
                *resp = (node_info_msg_t *) resp_msg.data;
                break;
        case RESPONSE_SLURM_RC:
                rc = ((return_code_msg_t *) resp_msg.data)->return_code;
                slurm_free_return_code_msg(resp_msg.data);
                if (rc)
                        slurm_seterrno_ret(rc);
                *resp = NULL;
                break;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_load_reservations - issue RPC to get all slurm reservation
 *	configuration information if changed since update_time
 * IN update_time - time of current configuration data
 * OUT resp - place to store a reservation configuration pointer
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_reservation_info_msg
 */
extern int slurm_load_reservations (time_t update_time,
                                    reserve_info_msg_t **resp)
{
        int rc;
        slurm_msg_t req_msg;
        slurm_msg_t resp_msg;
        resv_info_request_msg_t req;

        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);

        req.last_update  = update_time;
        req_msg.msg_type = REQUEST_RESERVATION_INFO;
        req_msg.data     = &req;

        if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
                return SLURM_ERROR;

        switch (resp_msg.msg_type) {
        case RESPONSE_RESERVATION_INFO:
                *resp = (reserve_info_msg_t *) resp_msg.data;
                break;
        case RESPONSE_SLURM_RC:
                rc = ((return_code_msg_t *) resp_msg.data)->return_code;
                slurm_free_return_code_msg(resp_msg.data);
                if (rc)
                        slurm_seterrno_ret(rc);
                *resp = NULL;
                break;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_PROTOCOL_SUCCESS;
}
int _send_message_controller (enum controller_id dest, slurm_msg_t *req)
{
        int rc = SLURM_PROTOCOL_SUCCESS;
        slurm_fd_t fd = -1;
        slurm_msg_t *resp_msg = NULL;

        /* always going to one node (primary or backup per value of "dest") */
        if ((fd = slurm_open_controller_conn_spec(dest)) < 0)
                slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR);

        if (slurm_send_node_msg(fd, req) < 0) {
                slurm_shutdown_msg_conn(fd);
                slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_SEND_ERROR);
        }

        resp_msg = xmalloc(sizeof(slurm_msg_t));
        slurm_msg_t_init(resp_msg);
        if ((rc = slurm_receive_msg(fd, resp_msg, 0)) != 0) {
                slurm_shutdown_msg_conn(fd);
                slurm_free_msg(resp_msg);       /* don't leak on receive error */
                return SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR;
        }

        if (slurm_shutdown_msg_conn(fd) != SLURM_SUCCESS)
                rc = SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR;
        else if (resp_msg->msg_type != RESPONSE_SLURM_RC)
                rc = SLURM_UNEXPECTED_MSG_ERROR;
        else
                rc = slurm_get_return_code(resp_msg->msg_type,
                                           resp_msg->data);
        slurm_free_msg(resp_msg);

        if (rc)
                slurm_seterrno_ret(rc);
        return rc;
}
int slurm_job_cpus_allocated_str_on_node_id(char *cpus, size_t cpus_len,
                                            job_resources_t *job_resrcs_ptr,
                                            int node_id)
{
        int start_node = -1;    /* start at -1 so the array reps
                                 * line up correctly */
        uint32_t threads = 1;
        int inx = 0;
        bitstr_t *cpu_bitmap;
        int j, k, bit_inx, bit_reps;

        if (!job_resrcs_ptr || node_id < 0)
                slurm_seterrno_ret(EINVAL);

        /* find index in sock_core_rep_count[] for this node id */
        do {
                start_node += job_resrcs_ptr->sock_core_rep_count[inx];
                inx++;
        } while (start_node < node_id);
        /* back to previous index since inx is always one step further
         * after previous loop */
        inx--;

        bit_reps = job_resrcs_ptr->sockets_per_node[inx] *
                   job_resrcs_ptr->cores_per_socket[inx];

        /* get the number of threads per core on this node
         * (job_node_ptr is the file-scope cache of node records
         * populated elsewhere in this file) */
        if (job_node_ptr)
                threads = job_node_ptr->node_array[node_id].threads;

        bit_inx = 0;
        cpu_bitmap = bit_alloc(bit_reps * threads);
        for (j = 0; j < bit_reps; j++) {
                if (bit_test(job_resrcs_ptr->core_bitmap, bit_inx)) {
                        for (k = 0; k < threads; k++)
                                bit_set(cpu_bitmap, (j * threads) + k);
                }
                bit_inx++;
        }
        bit_fmt(cpus, cpus_len, cpu_bitmap);
        FREE_NULL_BITMAP(cpu_bitmap);

        return SLURM_SUCCESS;
}
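/*
 * Worked example for the core-to-CPU expansion above, with hypothetical
 * numbers: on a node with sockets_per_node = 2, cores_per_socket = 4 and
 * threads = 2, core bits 0 and 1 set in core_bitmap expand to CPU bits
 * 0-3 (each core contributes "threads" consecutive CPU bits), so
 * bit_fmt() renders the string "0-3" into the caller's buffer.
 */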
extern int slurm_job_cpus_allocated_on_node(job_resources_t *job_resrcs_ptr,
                                            const char *node)
{
        hostlist_t node_hl;
        int node_id;

        if (!job_resrcs_ptr || !node || !job_resrcs_ptr->nodes)
                slurm_seterrno_ret(EINVAL);

        node_hl = hostlist_create(job_resrcs_ptr->nodes);
        node_id = hostlist_find(node_hl, node);
        hostlist_destroy(node_hl);
        if (node_id == -1)
                return (0);     /* No cpus allocated on this node */

        return slurm_job_cpus_allocated_on_node_id(job_resrcs_ptr, node_id);
}
/* _slurm_update - issue RPC for all update requests */
static int _slurm_update (void *data, slurm_msg_type_t msg_type)
{
        int rc;
        slurm_msg_t req_msg;

        slurm_msg_t_init(&req_msg);
        req_msg.msg_type = msg_type;
        req_msg.data     = data;

        if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
                return SLURM_ERROR;

        if (rc != SLURM_SUCCESS)
                slurm_seterrno_ret(rc);

        return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_reconfigure - issue RPC to have Slurm controller (slurmctld)
 *	reload its configuration file
 * RET 0 or a slurm error code
 */
int slurm_reconfigure (void)
{
        int rc;
        slurm_msg_t req;

        slurm_msg_t_init(&req);
        req.msg_type = REQUEST_RECONFIGURE;

        if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
                return SLURM_ERROR;

        if (rc)
                slurm_seterrno_ret(rc);

        return SLURM_PROTOCOL_SUCCESS;
}
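/*
 * Minimal usage sketch (illustrative only): an scontrol-style caller just
 * reports errno on failure via the public slurm_perror() helper.
 */
static int _example_reconfigure(void)
{
        if (slurm_reconfigure() != 0) {
                slurm_perror("slurm_reconfigure");
                return -1;
        }
        return 0;
}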
/*
 * slurm_job_step_create - create a job step for a given job id
 * IN req - description of job step request
 * OUT resp - response to request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_job_step_create_response_msg
 */
int slurm_job_step_create (job_step_create_request_msg_t *req,
                           job_step_create_response_msg_t **resp)
{
        slurm_msg_t req_msg, resp_msg;
        int delay = 0, rc, retry = 0;

        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);
        req_msg.msg_type = REQUEST_JOB_STEP_CREATE;
        req_msg.data     = req;

re_send:
        if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
                                           working_cluster_rec) < 0)
                return SLURM_ERROR;

        switch (resp_msg.msg_type) {
        case RESPONSE_SLURM_RC:
                rc = _handle_rc_msg(&resp_msg);
                if ((rc < 0) && (errno == EAGAIN)) {
                        if (retry++ == 0) {
                                verbose("Slurm is busy, step creation delayed");
                                delay = (getpid() % 10) + 10;   /* 10-19 secs */
                        }
                        sleep(delay);
                        goto re_send;
                }
                if (rc < 0)
                        return SLURM_PROTOCOL_ERROR;
                *resp = NULL;
                break;
        case RESPONSE_JOB_STEP_CREATE:
                *resp = (job_step_create_response_msg_t *) resp_msg.data;
                break;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_PROTOCOL_SUCCESS;
}
/* slurm_load_licenses()
 *
 * Load requested licenses from the controller.
 */
extern int slurm_load_licenses(time_t t,
                               license_info_msg_t **lic_info,
                               uint16_t show_flags)
{
        int cc;
        slurm_msg_t msg_request;
        slurm_msg_t msg_reply;
        struct license_info_request_msg req;

        memset(&req, 0, sizeof(struct license_info_request_msg));
        slurm_msg_t_init(&msg_request);
        slurm_msg_t_init(&msg_reply);

        msg_request.msg_type = REQUEST_LICENSE_INFO;
        req.last_update      = t;
        req.show_flags       = show_flags;
        msg_request.data     = &req;

        cc = slurm_send_recv_controller_msg(&msg_request, &msg_reply,
                                            working_cluster_rec);
        if (cc < 0)
                return SLURM_ERROR;

        switch (msg_reply.msg_type) {
        case RESPONSE_LICENSE_INFO:
                *lic_info = msg_reply.data;
                break;
        case RESPONSE_SLURM_RC:
                cc = ((return_code_msg_t *) msg_reply.data)->return_code;
                slurm_free_return_code_msg(msg_reply.data);
                if (cc)  /* slurm_seterrno_ret() is a macro ... sigh */
                        slurm_seterrno(cc);
                *lic_info = NULL;
                return -1;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_PROTOCOL_SUCCESS;
}
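/*
 * Illustrative sketch: dumping the license table returned above. The
 * num_lic/lic_array member names come from the public headers and should
 * be treated as an assumption; also assumes <stdio.h>.
 */
static void _example_print_licenses(void)
{
        license_info_msg_t *lic_msg = NULL;
        uint32_t i;

        if (slurm_load_licenses((time_t) 0, &lic_msg, SHOW_ALL) != 0) {
                slurm_perror("slurm_load_licenses");
                return;
        }
        for (i = 0; i < lic_msg->num_lic; i++)
                printf("license %s: %u of %u in use\n",
                       lic_msg->lic_array[i].name,
                       lic_msg->lic_array[i].in_use,
                       lic_msg->lic_array[i].total);
        slurm_free_license_info_msg(lic_msg);
}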
extern int slurm_job_cpus_allocated_on_node_id(
        job_resources_t *job_resrcs_ptr, int node_id)
{
        int i;
        int start_node = -1;    /* start at -1 so the array reps
                                 * line up correctly */

        if (!job_resrcs_ptr || node_id < 0)
                slurm_seterrno_ret(EINVAL);

        for (i = 0; i < job_resrcs_ptr->cpu_array_cnt; i++) {
                start_node += job_resrcs_ptr->cpu_array_reps[i];
                if (start_node >= node_id)
                        break;
        }

        if (i >= job_resrcs_ptr->cpu_array_cnt)
                return (0);     /* node_id not in this job */

        return job_resrcs_ptr->cpu_array_value[i];
}
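/*
 * Worked example of the cpu_array run-length encoding walked above.
 * The values here are hypothetical, not taken from any real allocation:
 *
 *   cpu_array_value = { 8, 4 }    CPUs per node in each group
 *   cpu_array_reps  = { 2, 3 }    consecutive nodes in each group
 *
 * This encodes a 5-node allocation laid out as 8,8,4,4,4 CPUs, so a
 * node_id of 1 resolves to 8 and a node_id of 4 resolves to 4.
 */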
/*
 * slurm_allocate_resources - allocate resources for a job request
 * IN job_desc_msg - description of resource allocation request
 * OUT slurm_alloc_msg - response to request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
int slurm_allocate_resources (job_desc_msg_t *req,
                              resource_allocation_response_msg_t **resp)
{
        int rc;
        slurm_msg_t req_msg;
        slurm_msg_t resp_msg;

        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);

        /*
         * set node and session id for this request
         */
        if (req->alloc_sid == NO_VAL)
                req->alloc_sid = getsid(0);

        req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
        req_msg.data     = req;

        rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
                                            working_cluster_rec);
        if (rc == SLURM_ERROR)
                return SLURM_ERROR;

        switch (resp_msg.msg_type) {
        case RESPONSE_SLURM_RC:
                if (_handle_rc_msg(&resp_msg) < 0)
                        return SLURM_ERROR;
                *resp = NULL;
                break;
        case RESPONSE_RESOURCE_ALLOCATION:
                *resp = (resource_allocation_response_msg_t *) resp_msg.data;
                break;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
        }

        return SLURM_SUCCESS;
}
int slurm_job_cpus_allocated_str_on_node(char *cpus, size_t cpus_len,
                                         job_resources_t *job_resrcs_ptr,
                                         const char *node)
{
        hostlist_t node_hl;
        int node_id;

        if (!job_resrcs_ptr || !node || !job_resrcs_ptr->nodes)
                slurm_seterrno_ret(EINVAL);

        node_hl = hostlist_create(job_resrcs_ptr->nodes);
        node_id = hostlist_find(node_hl, node);
        hostlist_destroy(node_hl);
        if (node_id == -1)
                return SLURM_ERROR;

        return slurm_job_cpus_allocated_str_on_node_id(cpus, cpus_len,
                                                       job_resrcs_ptr,
                                                       node_id);
}
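/*
 * Illustrative sketch tying the two by-name lookups together. The
 * job_resources_t pointer would normally come from a job record returned
 * by slurm_load_jobs(); assumes <stdio.h>.
 */
static void _example_node_cpus(job_resources_t *job_resrcs,
                               const char *node)
{
        char mask[64];
        int cnt = slurm_job_cpus_allocated_on_node(job_resrcs, node);

        if (slurm_job_cpus_allocated_str_on_node(mask, sizeof(mask),
                                                 job_resrcs, node) == 0)
                printf("%d CPUs on %s (mask %s)\n", cnt, node, mask);
}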
int optz_add(struct option **optz, const struct option *opt)
{
        int len = 0;
        struct option *op = *optz;
        struct option *t = *optz;

        for (; op->name != NULL; op++) {
                if (strcmp(op->name, opt->name) == 0)
                        slurm_seterrno_ret(EEXIST);
                len++;
        }

        ++len;  /* Add one for incoming option */

        t = xrealloc(t, (len + 1) * sizeof(struct option));
        t[len - 1] = *opt;
        t[len]     = opt_table_end;
        *optz      = t;

        return (0);
}
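/*
 * Sketch of extending a getopt_long() table with optz_add(). The
 * "--widget" option is hypothetical, and the snippet assumes <getopt.h>
 * and <errno.h> plus the error() logging helper used elsewhere in this
 * code; treat it as an illustration, not a documented usage pattern.
 */
static struct option _widget_opt = {
        "widget", required_argument, NULL, (int) 'W'
};

static void _example_extend_options(struct option **optz)
{
        if ((optz_add(optz, &_widget_opt) < 0) && (errno == EEXIST))
                error("--widget is already registered");
}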
/*
 * slurm_clear_trigger - Clear (remove) an existing event trigger
 * RET 0 or a slurm error code
 */
extern int slurm_clear_trigger (trigger_info_t *trigger_clear)
{
        int rc;
        slurm_msg_t msg;
        trigger_info_msg_t req;

        slurm_msg_t_init(&msg);
        /*
         * Request message:
         */
        memset(&req, 0, sizeof(trigger_info_msg_t));    /* match
                                                 * slurm_pull_trigger() */
        req.record_count  = 1;
        req.trigger_array = trigger_clear;
        msg.msg_type      = REQUEST_TRIGGER_CLEAR;
        msg.data          = &req;

        if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
                return SLURM_FAILURE;
        if (rc)
                slurm_seterrno_ret(rc);

        return SLURM_SUCCESS;
}
/*
 * slurm_pull_trigger - Pull (fire) an event trigger
 * RET 0 or a slurm error code
 */
extern int slurm_pull_trigger (trigger_info_t *trigger_pull)
{
        int rc;
        slurm_msg_t msg;
        trigger_info_msg_t req;

        /*
         * Request message:
         */
        slurm_msg_t_init(&msg);
        memset(&req, 0, sizeof(trigger_info_msg_t));
        req.record_count  = 1;
        req.trigger_array = trigger_pull;
        msg.msg_type      = REQUEST_TRIGGER_PULL;
        msg.data          = &req;

        if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
                return SLURM_FAILURE;
        if (rc)
                slurm_seterrno_ret(rc);

        return SLURM_SUCCESS;
}
/*
 * slurm_checkpoint_complete - note the completion of a job step's
 *	checkpoint operation.
 * IN job_id - job on which to perform operation
 * IN step_id - job step on which to perform operation
 * IN begin_time - time at which checkpoint began
 * IN error_code - error code, highest value for all complete calls is
 *	preserved
 * IN error_msg - error message, preserved for highest error_code
 * RET 0 or a slurm error code
 */
extern int slurm_checkpoint_complete (uint32_t job_id, uint32_t step_id,
                                      time_t begin_time, uint32_t error_code,
                                      char *error_msg)
{
        int rc;
        slurm_msg_t msg;
        checkpoint_comp_msg_t req;

        slurm_msg_t_init(&msg);
        req.job_id     = job_id;
        req.step_id    = step_id;
        req.begin_time = begin_time;
        req.error_code = error_code;
        req.error_msg  = error_msg;
        msg.msg_type   = REQUEST_CHECKPOINT_COMP;
        msg.data       = &req;

        if (slurm_send_recv_controller_rc_msg(&msg, &rc,
                                              working_cluster_rec) < 0)
                return SLURM_ERROR;
        if (rc)
                slurm_seterrno_ret(rc);

        return SLURM_SUCCESS;
}
/*
 * slurm_allocation_lookup - retrieve info for an existing resource
 *	allocation without the addrs and such
 * IN jobid - job allocation identifier
 * OUT info - job allocation information
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
extern int slurm_allocation_lookup(uint32_t jobid,
                                   resource_allocation_response_msg_t **info)
{
        job_alloc_info_msg_t req = {0};
        slurm_msg_t req_msg;
        slurm_msg_t resp_msg;

        req.job_id      = jobid;
        req.req_cluster = slurmctld_conf.cluster_name;
        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);
        req_msg.msg_type = REQUEST_JOB_ALLOCATION_INFO;
        req_msg.data     = &req;

        if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
                                           working_cluster_rec) < 0)
                return SLURM_ERROR;

        req.req_cluster = NULL;

        switch (resp_msg.msg_type) {
        case RESPONSE_SLURM_RC:
                if (_handle_rc_msg(&resp_msg) < 0)
                        return SLURM_ERROR;
                *info = NULL;
                break;
        case RESPONSE_JOB_ALLOCATION_INFO:
                *info = (resource_allocation_response_msg_t *) resp_msg.data;
                return SLURM_PROTOCOL_SUCCESS;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_PROTOCOL_SUCCESS;
}
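/*
 * Sketch: looking up an existing allocation by job id. The job_id and
 * node_list members are the documented response fields; assumes
 * <stdio.h>. Illustration only, under those assumptions.
 */
static void _example_lookup(uint32_t job_id)
{
        resource_allocation_response_msg_t *alloc = NULL;

        if (slurm_allocation_lookup(job_id, &alloc) != 0) {
                slurm_perror("slurm_allocation_lookup");
                return;
        }
        if (alloc) {
                printf("job %u runs on %s\n", alloc->job_id,
                       alloc->node_list);
                slurm_free_resource_allocation_response_msg(alloc);
        }
}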
/*
 * slurm_sbcast_lookup - retrieve info for an existing resource allocation
 *	including a credential needed for sbcast
 * IN job_id - job allocation identifier (or pack job ID)
 * IN pack_job_offset - pack job index (or NO_VAL if not pack job)
 * IN step_id - step allocation identifier (or NO_VAL for entire job)
 * OUT info - job allocation information including a credential for sbcast
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the response using slurm_free_sbcast_cred_msg
 */
extern int slurm_sbcast_lookup(uint32_t job_id, uint32_t pack_job_offset,
                               uint32_t step_id, job_sbcast_cred_msg_t **info)
{
        step_alloc_info_msg_t req;
        slurm_msg_t req_msg;
        slurm_msg_t resp_msg;

        req.job_id          = job_id;
        req.pack_job_offset = pack_job_offset;
        req.step_id         = step_id;
        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);
        req_msg.msg_type = REQUEST_JOB_SBCAST_CRED;
        req_msg.data     = &req;

        if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
                                           working_cluster_rec) < 0)
                return SLURM_ERROR;

        switch (resp_msg.msg_type) {
        case RESPONSE_SLURM_RC:
                if (_handle_rc_msg(&resp_msg) < 0)
                        return SLURM_ERROR;
                *info = NULL;
                break;
        case RESPONSE_JOB_SBCAST_CRED:
                *info = (job_sbcast_cred_msg_t *) resp_msg.data;
                return SLURM_SUCCESS;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_SUCCESS;
}
/*
 * slurm_job_will_run - determine if a job would execute immediately if
 *	submitted now
 * IN job_desc_msg - description of resource allocation request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 */
int slurm_job_will_run (job_desc_msg_t *req)
{
        slurm_msg_t req_msg, resp_msg;
        will_run_response_msg_t *will_run_resp;
        char buf[64];
        bool host_set = false;
        int rc;
        uint32_t cluster_flags = slurmdb_setup_cluster_flags();
        char *type = "processors";

        /* req.immediate = true; implicit */
        if ((req->alloc_node == NULL) &&
            (gethostname_short(buf, sizeof(buf)) == 0)) {
                req->alloc_node = buf;
                host_set = true;
        }
        slurm_msg_t_init(&req_msg);
        req_msg.msg_type = REQUEST_JOB_WILL_RUN;
        req_msg.data     = req;

        rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

        if (host_set)
                req->alloc_node = NULL;

        if (rc < 0)
                return SLURM_SOCKET_ERROR;

        switch (resp_msg.msg_type) {
        case RESPONSE_SLURM_RC:
                if (_handle_rc_msg(&resp_msg) < 0)
                        return SLURM_PROTOCOL_ERROR;
                break;
        case RESPONSE_JOB_WILL_RUN:
                if (cluster_flags & CLUSTER_FLAG_BG)
                        type = "cnodes";
                will_run_resp = (will_run_response_msg_t *) resp_msg.data;
                slurm_make_time_str(&will_run_resp->start_time,
                                    buf, sizeof(buf));
                info("Job %u to start at %s using %u %s on %s",
                     will_run_resp->job_id, buf,
                     will_run_resp->proc_cnt, type,
                     will_run_resp->node_list);
                if (will_run_resp->preemptee_job_id) {
                        ListIterator itr;
                        uint32_t *job_id_ptr;
                        char *job_list = NULL, *sep = "";
                        itr = list_iterator_create(
                                will_run_resp->preemptee_job_id);
                        while ((job_id_ptr = list_next(itr))) {
                                if (job_list)
                                        sep = ",";
                                xstrfmtcat(job_list, "%s%u", sep,
                                           *job_id_ptr);
                        }
                        list_iterator_destroy(itr);     /* was leaked */
                        info("  Preempts: %s", job_list);
                        xfree(job_list);
                }
                slurm_free_will_run_response_msg(will_run_resp);
                break;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_PROTOCOL_SUCCESS;
}
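/*
 * Sketch: asking the controller whether a request could start now. The
 * job_desc_msg_t setup is deliberately minimal and hypothetical; a real
 * caller fills in many more fields. Assumes <unistd.h> for getuid()
 * and getgid(), and the public slurm_init_job_desc_msg() initializer.
 */
static void _example_will_run(void)
{
        job_desc_msg_t desc;

        slurm_init_job_desc_msg(&desc);
        desc.min_nodes = 2;
        desc.user_id   = getuid();
        desc.group_id  = getgid();

        /* slurm_job_will_run() logs the projected start time itself */
        if (slurm_job_will_run(&desc) != 0)
                slurm_perror("slurm_job_will_run");
}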
/*
 * slurm_load_slurmd_status - issue RPC to get the status of the slurmd
 *	daemon on this machine
 * OUT slurmd_status_ptr - place to store slurmd status information
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_slurmd_status()
 */
extern int slurm_load_slurmd_status(slurmd_status_t **slurmd_status_ptr)
{
        int rc;
        slurm_msg_t req_msg;
        slurm_msg_t resp_msg;
        uint32_t cluster_flags = slurmdb_setup_cluster_flags();
        char *this_addr;

        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);

        if (cluster_flags & CLUSTER_FLAG_MULTSD) {
                if ((this_addr = getenv("SLURMD_NODENAME"))) {
                        slurm_conf_get_addr(this_addr, &req_msg.address);
                } else {
                        this_addr = "localhost";
                        slurm_set_addr(&req_msg.address,
                                       (uint16_t) slurm_get_slurmd_port(),
                                       this_addr);
                }
        } else {
                char this_host[256];
                /*
                 * Set request message address to slurmd on localhost
                 */
                gethostname_short(this_host, sizeof(this_host));
                this_addr = slurm_conf_get_nodeaddr(this_host);
                if (this_addr == NULL)
                        this_addr = xstrdup("localhost");
                slurm_set_addr(&req_msg.address,
                               (uint16_t) slurm_get_slurmd_port(),
                               this_addr);
                xfree(this_addr);
        }
        req_msg.msg_type = REQUEST_DAEMON_STATUS;
        req_msg.data     = NULL;

        rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0);

        if ((rc != 0) || !resp_msg.auth_cred) {
                error("slurm_slurmd_info: %m");
                if (resp_msg.auth_cred)
                        g_slurm_auth_destroy(resp_msg.auth_cred);
                return SLURM_ERROR;
        }
        if (resp_msg.auth_cred)
                g_slurm_auth_destroy(resp_msg.auth_cred);

        switch (resp_msg.msg_type) {
        case RESPONSE_SLURMD_STATUS:
                *slurmd_status_ptr = (slurmd_status_t *) resp_msg.data;
                break;
        case RESPONSE_SLURM_RC:
                rc = ((return_code_msg_t *) resp_msg.data)->return_code;
                slurm_free_return_code_msg(resp_msg.data);
                if (rc)
                        slurm_seterrno_ret(rc);
                break;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_PROTOCOL_SUCCESS;
}
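/*
 * Sketch: querying the local slurmd and printing two of the documented
 * slurmd_status_t fields. Assumes <stdio.h>; illustration only.
 */
static void _example_slurmd_status(void)
{
        slurmd_status_t *status = NULL;

        if (slurm_load_slurmd_status(&status) != 0) {
                slurm_perror("slurm_load_slurmd_status");
                return;
        }
        printf("slurmd on %s, version %s\n",
               status->hostname, status->version);
        slurm_free_slurmd_status(status);       /* per the NOTE above */
}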
/*
 * check_header_version checks to see that the specified header was sent
 *	from a node running the same version of the protocol as the
 *	current node
 * IN header - the message header received
 * RET - SLURM error code
 */
int check_header_version(header_t * header)
{
        uint16_t check_version = SLURM_PROTOCOL_VERSION;

        if (working_cluster_rec)
                check_version = _get_slurm_version(
                        working_cluster_rec->rpc_version);

        if (slurmdbd_conf) {
                if ((header->version != SLURM_PROTOCOL_VERSION) &&
                    (header->version != SLURM_2_2_PROTOCOL_VERSION) &&
                    (header->version != SLURM_2_1_PROTOCOL_VERSION))
                        slurm_seterrno_ret(SLURM_PROTOCOL_VERSION_ERROR);
        } else if (header->version != check_version) {
                /* Starting with 2.2 we will handle previous versions
                 * of SLURM for some calls */
                switch (header->msg_type) {
                case REQUEST_BLOCK_INFO:
                case REQUEST_BUILD_INFO:
                case REQUEST_CANCEL_JOB_STEP:
                case REQUEST_CHECKPOINT:
                case REQUEST_CHECKPOINT_COMP:
                case REQUEST_CHECKPOINT_TASK_COMP:
                case REQUEST_COMPLETE_BATCH_SCRIPT:     /* From slurmstepd */
                case REQUEST_COMPLETE_JOB_ALLOCATION:
                case REQUEST_CREATE_PARTITION:
                case REQUEST_CREATE_RESERVATION:
                case REQUEST_JOB_END_TIME:
                case REQUEST_JOB_INFO:
                case REQUEST_JOB_INFO_SINGLE:
                case REQUEST_JOB_READY:
                case REQUEST_JOB_REQUEUE:
                case REQUEST_JOB_STEP_INFO:
                case REQUEST_JOB_WILL_RUN:
                case REQUEST_NODE_INFO:
                case REQUEST_PARTITION_INFO:
                case REQUEST_PRIORITY_FACTORS:
                case REQUEST_RECONFIGURE:
                case REQUEST_RESERVATION_INFO:
                case REQUEST_SET_DEBUG_FLAGS:
                case REQUEST_SET_DEBUG_LEVEL:
                case REQUEST_SHARE_INFO:
                case REQUEST_SHUTDOWN:
                case REQUEST_SHUTDOWN_IMMEDIATE:
                case REQUEST_STEP_COMPLETE:             /* From slurmstepd */
                case REQUEST_STEP_LAYOUT:
                case REQUEST_SUBMIT_BATCH_JOB:
                case REQUEST_SUSPEND:
                case REQUEST_TOPO_INFO:
                case REQUEST_UPDATE_BLOCK:
                case REQUEST_UPDATE_JOB:
                case REQUEST_UPDATE_PARTITION:
                        if ((header->version == SLURM_2_2_PROTOCOL_VERSION) ||
                            (header->version == SLURM_2_1_PROTOCOL_VERSION))
                                break;
                        /* fall through: unsupported older version */
                default:
                        slurm_seterrno_ret(SLURM_PROTOCOL_VERSION_ERROR);
                        break;
                }
        }
        return SLURM_PROTOCOL_SUCCESS;
}
static int _load_fed_parts(slurm_msg_t *req_msg,
                           partition_info_msg_t **part_info_msg_pptr,
                           uint16_t show_flags, char *cluster_name,
                           slurmdb_federation_rec_t *fed)
{
        int cluster_inx = 0, i;
        load_part_resp_struct_t *part_resp;
        partition_info_msg_t *orig_msg = NULL, *new_msg = NULL;
        uint32_t new_rec_cnt;
        slurmdb_cluster_rec_t *cluster;
        ListIterator iter;
        pthread_attr_t load_attr;
        int pthread_count = 0;
        pthread_t *load_thread = NULL;
        load_part_req_struct_t *load_args;
        List resp_msg_list;

        *part_info_msg_pptr = NULL;

        /* Spawn one pthread per cluster to collect partition information */
        resp_msg_list = list_create(NULL);
        load_thread = xmalloc(sizeof(pthread_t) *       /* was wrongly
                              * sized as pthread_attr_t */
                              list_count(fed->cluster_list));
        iter = list_iterator_create(fed->cluster_list);
        while ((cluster = (slurmdb_cluster_rec_t *) list_next(iter))) {
                int retries = 0;
                if ((cluster->control_host == NULL) ||
                    (cluster->control_host[0] == '\0'))
                        continue;       /* Cluster down */
                load_args = xmalloc(sizeof(load_part_req_struct_t));
                load_args->cluster       = cluster;
                load_args->cluster_inx   = cluster_inx++;
                load_args->req_msg       = req_msg;
                load_args->resp_msg_list = resp_msg_list;
                load_args->show_flags    = show_flags;
                slurm_attr_init(&load_attr);
                if (pthread_attr_setdetachstate(&load_attr,
                                                PTHREAD_CREATE_JOINABLE))
                        error("pthread_attr_setdetachstate error %m");
                while (pthread_create(&load_thread[pthread_count],
                                      &load_attr, _load_part_thread,
                                      (void *) load_args)) {
                        error("pthread_create error %m");
                        if (++retries > MAX_RETRIES)
                                fatal("Can't create pthread");
                        usleep(10000);  /* sleep and retry */
                }
                pthread_count++;
                slurm_attr_destroy(&load_attr);
        }
        list_iterator_destroy(iter);

        /* Wait for all pthreads to complete */
        for (i = 0; i < pthread_count; i++)
                pthread_join(load_thread[i], NULL);
        xfree(load_thread);

        /* Maintain a consistent cluster/node ordering */
        list_sort(resp_msg_list, _sort_by_cluster_inx);

        /* Merge the responses into a single response message */
        iter = list_iterator_create(resp_msg_list);
        while ((part_resp = (load_part_resp_struct_t *) list_next(iter))) {
                new_msg = part_resp->new_msg;
                if (!orig_msg) {
                        orig_msg = new_msg;
                        *part_info_msg_pptr = orig_msg;
                } else {
                        /* Merge the partition records */
                        orig_msg->last_update = MIN(orig_msg->last_update,
                                                    new_msg->last_update);
                        new_rec_cnt = orig_msg->record_count +
                                      new_msg->record_count;
                        if (new_msg->record_count) {
                                orig_msg->partition_array =
                                        xrealloc(orig_msg->partition_array,
                                                 sizeof(partition_info_t) *
                                                 new_rec_cnt);
                                (void) memcpy(orig_msg->partition_array +
                                              orig_msg->record_count,
                                              new_msg->partition_array,
                                              sizeof(partition_info_t) *
                                              new_msg->record_count);
                                orig_msg->record_count = new_rec_cnt;
                        }
                        xfree(new_msg->partition_array);
                        xfree(new_msg);
                }
                xfree(part_resp);
        }
        list_iterator_destroy(iter);
        FREE_NULL_LIST(resp_msg_list);

        if (!orig_msg)
                slurm_seterrno_ret(SLURM_ERROR);

        return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_get_node_energy - issue RPC to get the energy data of all
 *	configured sensors on the target machine
 * IN host - name of node to query, NULL if localhost
 * IN delta - use cache if data is newer than this in seconds
 * OUT sensor_cnt - number of sensors
 * OUT energy - array of acct_gather_energy_t structures on success or
 *	NULL otherwise
 * RET 0 on success or a slurm error code
 * NOTE: free the response using xfree
 */
extern int slurm_get_node_energy(char *host, uint16_t delta,
                                 uint16_t *sensor_cnt,
                                 acct_gather_energy_t **energy)
{
        int rc;
        slurm_msg_t req_msg;
        slurm_msg_t resp_msg;
        acct_gather_energy_req_msg_t req;
        uint32_t cluster_flags = slurmdb_setup_cluster_flags();
        char *this_addr;

        xassert(sensor_cnt);
        xassert(energy);

        *sensor_cnt = 0;
        *energy = NULL;

        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);

        if (host)
                slurm_conf_get_addr(host, &req_msg.address);
        else if (cluster_flags & CLUSTER_FLAG_MULTSD) {
                if ((this_addr = getenv("SLURMD_NODENAME"))) {
                        slurm_conf_get_addr(this_addr, &req_msg.address);
                } else {
                        this_addr = "localhost";
                        slurm_set_addr(&req_msg.address,
                                       (uint16_t) slurm_get_slurmd_port(),
                                       this_addr);
                }
        } else {
                char this_host[256];
                /*
                 * Set request message address to slurmd on localhost
                 */
                gethostname_short(this_host, sizeof(this_host));
                this_addr = slurm_conf_get_nodeaddr(this_host);
                if (this_addr == NULL)
                        this_addr = xstrdup("localhost");
                slurm_set_addr(&req_msg.address,
                               (uint16_t) slurm_get_slurmd_port(),
                               this_addr);
                xfree(this_addr);
        }

        req.delta        = delta;
        req_msg.msg_type = REQUEST_ACCT_GATHER_ENERGY;
        req_msg.data     = &req;

        rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0);

        if (rc != 0 || !resp_msg.auth_cred) {
                error("slurm_get_node_energy: %m");
                if (resp_msg.auth_cred)
                        g_slurm_auth_destroy(resp_msg.auth_cred);
                return SLURM_ERROR;
        }
        if (resp_msg.auth_cred)
                g_slurm_auth_destroy(resp_msg.auth_cred);

        switch (resp_msg.msg_type) {
        case RESPONSE_ACCT_GATHER_ENERGY:
                *sensor_cnt = ((acct_gather_node_resp_msg_t *)
                               resp_msg.data)->sensor_cnt;
                *energy = ((acct_gather_node_resp_msg_t *)
                           resp_msg.data)->energy;
                ((acct_gather_node_resp_msg_t *) resp_msg.data)->energy =
                        NULL;   /* ownership transferred to caller */
                slurm_free_acct_gather_node_resp_msg(resp_msg.data);
                break;
        case RESPONSE_SLURM_RC:
                rc = ((return_code_msg_t *) resp_msg.data)->return_code;
                slurm_free_return_code_msg(resp_msg.data);
                if (rc)
                        slurm_seterrno_ret(rc);
                break;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_PROTOCOL_SUCCESS;
}
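/*
 * Sketch: polling the local slurmd's energy sensors. The current_watts
 * member is a documented acct_gather_energy_t field; the 10-second cache
 * delta is an arbitrary choice. Assumes <stdio.h>; illustration only.
 */
static void _example_energy(void)
{
        acct_gather_energy_t *energy = NULL;
        uint16_t sensor_cnt = 0, i;

        if (slurm_get_node_energy(NULL, 10, &sensor_cnt, &energy) != 0) {
                slurm_perror("slurm_get_node_energy");
                return;
        }
        for (i = 0; i < sensor_cnt; i++)
                printf("sensor %u: %u W\n", i, energy[i].current_watts);
        xfree(energy);  /* per the NOTE above */
}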
static int _load_fed_nodes(slurm_msg_t *req_msg,
                           node_info_msg_t **node_info_msg_pptr,
                           uint16_t show_flags, char *cluster_name,
                           slurmdb_federation_rec_t *fed)
{
        int cluster_inx = 0, i;
        load_node_resp_struct_t *node_resp;
        node_info_msg_t *orig_msg = NULL, *new_msg = NULL;
        uint32_t new_rec_cnt;
        slurmdb_cluster_rec_t *cluster;
        ListIterator iter;
        int pthread_count = 0;
        pthread_t *load_thread = NULL;
        load_node_req_struct_t *load_args;
        List resp_msg_list;

        *node_info_msg_pptr = NULL;

        /* Spawn one pthread per cluster to collect node information */
        resp_msg_list = list_create(NULL);
        load_thread = xmalloc(sizeof(pthread_t) *
                              list_count(fed->cluster_list));
        iter = list_iterator_create(fed->cluster_list);
        while ((cluster = (slurmdb_cluster_rec_t *) list_next(iter))) {
                if ((cluster->control_host == NULL) ||
                    (cluster->control_host[0] == '\0'))
                        continue;       /* Cluster down */
                load_args = xmalloc(sizeof(load_node_req_struct_t));
                load_args->cluster       = cluster;
                load_args->cluster_inx   = cluster_inx++;
                load_args->req_msg       = req_msg;
                load_args->resp_msg_list = resp_msg_list;
                load_args->show_flags    = show_flags;
                slurm_thread_create(&load_thread[pthread_count],
                                    _load_node_thread, load_args);
                pthread_count++;
        }
        list_iterator_destroy(iter);

        /* Wait for all pthreads to complete */
        for (i = 0; i < pthread_count; i++)
                pthread_join(load_thread[i], NULL);
        xfree(load_thread);

        /* Maintain a consistent cluster/node ordering */
        list_sort(resp_msg_list, _sort_by_cluster_inx);

        /* Merge the responses into a single response message */
        iter = list_iterator_create(resp_msg_list);
        while ((node_resp = (load_node_resp_struct_t *) list_next(iter))) {
                new_msg = node_resp->new_msg;
                if (!orig_msg) {
                        orig_msg = new_msg;
                        *node_info_msg_pptr = orig_msg;
                } else {
                        /* Merge the node records */
                        orig_msg->last_update = MIN(orig_msg->last_update,
                                                    new_msg->last_update);
                        new_rec_cnt = orig_msg->record_count +
                                      new_msg->record_count;
                        if (new_msg->record_count) {
                                orig_msg->node_array =
                                        xrealloc(orig_msg->node_array,
                                                 sizeof(node_info_t) *
                                                 new_rec_cnt);
                                (void) memcpy(orig_msg->node_array +
                                              orig_msg->record_count,
                                              new_msg->node_array,
                                              sizeof(node_info_t) *
                                              new_msg->record_count);
                                orig_msg->record_count = new_rec_cnt;
                        }
                        xfree(new_msg->node_array);
                        xfree(new_msg);
                }
                xfree(node_resp);
        }
        list_iterator_destroy(iter);
        FREE_NULL_LIST(resp_msg_list);

        if (!orig_msg)
                slurm_seterrno_ret(SLURM_ERROR);

        return SLURM_SUCCESS;
}
/*
 * slurm_get_end_time - get the expected end time for a given slurm job
 * IN jobid - slurm job id
 * OUT end_time_ptr - location in which to store scheduled end time for job
 * RET 0 or -1 on error
 */
extern int slurm_get_end_time(uint32_t jobid, time_t *end_time_ptr)
{
        int rc;
        slurm_msg_t resp_msg;
        slurm_msg_t req_msg;
        job_alloc_info_msg_t job_msg;
        srun_timeout_msg_t *timeout_msg;
        time_t now = time(NULL);
        static uint32_t jobid_cache = 0;
        static uint32_t jobid_env = 0;
        static time_t endtime_cache = 0;
        static time_t last_test_time = 0;

        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);

        if (!end_time_ptr)
                slurm_seterrno_ret(EINVAL);

        if (jobid == 0) {
                if (jobid_env) {
                        jobid = jobid_env;
                } else {
                        char *env = getenv("SLURM_JOB_ID");
                        if (env) {
                                jobid = (uint32_t) atol(env);
                                jobid_env = jobid;
                        }
                }
                if (jobid == 0) {
                        slurm_seterrno(ESLURM_INVALID_JOB_ID);
                        return SLURM_ERROR;
                }
        }

        /* Just use cached data if data less than 60 seconds old */
        if ((jobid == jobid_cache) &&
            (difftime(now, last_test_time) < 60)) {
                *end_time_ptr = endtime_cache;
                return SLURM_SUCCESS;
        }

        job_msg.job_id   = jobid;
        req_msg.msg_type = REQUEST_JOB_END_TIME;
        req_msg.data     = &job_msg;

        if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
                return SLURM_ERROR;

        switch (resp_msg.msg_type) {
        case SRUN_TIMEOUT:
                timeout_msg = (srun_timeout_msg_t *) resp_msg.data;
                last_test_time = time(NULL);
                jobid_cache    = jobid;
                endtime_cache  = timeout_msg->timeout;
                *end_time_ptr  = endtime_cache;
                slurm_free_srun_timeout_msg(resp_msg.data);
                break;
        case RESPONSE_SLURM_RC:
                rc = ((return_code_msg_t *) resp_msg.data)->return_code;
                slurm_free_return_code_msg(resp_msg.data);
                if (endtime_cache)
                        *end_time_ptr = endtime_cache;
                else if (rc)
                        slurm_seterrno_ret(rc);
                break;
        default:
                if (endtime_cache)
                        *end_time_ptr = endtime_cache;
                else
                        slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        return SLURM_SUCCESS;
}
/*
 * slurm_network_callerid - issue RPC to get the job id of a job from a
 *	remote slurmd based upon network socket information.
 * IN req - information about the network connection in question
 * OUT job_id - ID of the job or NO_VAL
 * OUT node_name - name of the remote slurmd
 * IN node_name_size - size of the node_name buffer
 * RET SLURM_PROTOCOL_SUCCESS or a slurm error code
 */
extern int slurm_network_callerid (network_callerid_msg_t req,
                                   uint32_t *job_id,
                                   char *node_name, int node_name_size)
{
        int rc;
        slurm_msg_t resp_msg;
        slurm_msg_t req_msg;
        network_callerid_resp_t *resp;
        struct sockaddr_in addr;
        uint32_t target_slurmd;         /* change for IPv6 support */

        debug("slurm_network_callerid RPC: start");

        slurm_msg_t_init(&req_msg);
        slurm_msg_t_init(&resp_msg);

        /* ip_src is the IP we want to talk to. Hopefully there's a slurmd
         * listening there */
        memset(&addr, 0, sizeof(addr));
        addr.sin_family = req.af;

        /* TODO: until IPv6 support is added to Slurm, we must hope that the
         * other end is IPv4 */
        if (req.af == AF_INET6) {
                error("IPv6 is not yet supported in Slurm");
                /* For testing IPv6 callerid prior to Slurm IPv6 RPC support,
                 * set a sane target, uncomment the following and comment out
                 * the return code:
                addr.sin_family = AF_INET;
                target_slurmd = inet_addr("127.0.0.1"); //choose a test target
                */
                return SLURM_FAILURE;
        } else
                memcpy(&target_slurmd, req.ip_src, 4);

        addr.sin_addr.s_addr = target_slurmd;
        addr.sin_port = htons(slurm_get_slurmd_port());
        req_msg.address = addr;

        req_msg.msg_type = REQUEST_NETWORK_CALLERID;
        req_msg.data     = &req;

        if (slurm_send_recv_node_msg(&req_msg, &resp_msg, 0) < 0)
                return SLURM_ERROR;

        switch (resp_msg.msg_type) {
        case RESPONSE_NETWORK_CALLERID:
                resp = (network_callerid_resp_t *) resp_msg.data;
                *job_id = resp->job_id;
                strncpy(node_name, resp->node_name, node_name_size);
                node_name[node_name_size - 1] = '\0';   /* strncpy may not
                                                         * NUL-terminate */
                break;
        case RESPONSE_SLURM_RC:
                rc = ((return_code_msg_t *) resp_msg.data)->return_code;
                slurm_free_return_code_msg(resp_msg.data);
                resp_msg.data = NULL;   /* don't free again below */
                if (rc)
                        slurm_seterrno_ret(rc);
                break;
        default:
                slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
                break;
        }

        slurm_free_network_callerid_msg(resp_msg.data);
        return SLURM_PROTOCOL_SUCCESS;
}