/* * slurm_load_slurmd_status - issue RPC to get the status of slurmd * daemon on this machine * IN slurmd_info_ptr - place to store slurmd status information * RET 0 or -1 on error * NOTE: free the response using slurm_free_slurmd_status() */ extern int slurm_load_slurmd_status(slurmd_status_t **slurmd_status_ptr) { int rc; slurm_msg_t req_msg; slurm_msg_t resp_msg; uint32_t cluster_flags = slurmdb_setup_cluster_flags(); char *this_addr; slurm_msg_t_init(&req_msg); slurm_msg_t_init(&resp_msg); if (cluster_flags & CLUSTER_FLAG_MULTSD) { if ((this_addr = getenv("SLURMD_NODENAME"))) { slurm_conf_get_addr(this_addr, &req_msg.address); } else { this_addr = "localhost"; slurm_set_addr(&req_msg.address, (uint16_t)slurm_get_slurmd_port(), this_addr); } } else { char this_host[256]; /* * Set request message address to slurmd on localhost */ gethostname_short(this_host, sizeof(this_host)); this_addr = slurm_conf_get_nodeaddr(this_host); if (this_addr == NULL) this_addr = xstrdup("localhost"); slurm_set_addr(&req_msg.address, (uint16_t)slurm_get_slurmd_port(), this_addr); xfree(this_addr); } req_msg.msg_type = REQUEST_DAEMON_STATUS; req_msg.data = NULL; rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0); if ((rc != 0) || !resp_msg.auth_cred) { error("slurm_slurmd_info: %m"); if (resp_msg.auth_cred) g_slurm_auth_destroy(resp_msg.auth_cred); return SLURM_ERROR; } if (resp_msg.auth_cred) g_slurm_auth_destroy(resp_msg.auth_cred); switch (resp_msg.msg_type) { case RESPONSE_SLURMD_STATUS: *slurmd_status_ptr = (slurmd_status_t *) resp_msg.data; break; case RESPONSE_SLURM_RC: rc = ((return_code_msg_t *) resp_msg.data)->return_code; slurm_free_return_code_msg(resp_msg.data); if (rc) slurm_seterrno_ret(rc); break; default: slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR); break; } return SLURM_PROTOCOL_SUCCESS; }
/* * slurm_network_callerid - issue RPC to get the job id of a job from a remote * slurmd based upon network socket information. * * IN req - Information about network connection in question * OUT job_id - ID of the job or NO_VAL * OUT node_name - name of the remote slurmd * IN node_name_size - size of the node_name buffer * RET SLURM_PROTOCOL_SUCCESS or SLURM_FAILURE on error */ extern int slurm_network_callerid (network_callerid_msg_t req, uint32_t *job_id, char *node_name, int node_name_size) { int rc; slurm_msg_t resp_msg; slurm_msg_t req_msg; network_callerid_resp_t *resp; struct sockaddr_in addr; uint32_t target_slurmd; /* change for IPv6 support */ debug("slurm_network_callerid RPC: start"); slurm_msg_t_init(&req_msg); slurm_msg_t_init(&resp_msg); /* ip_src is the IP we want to talk to. Hopefully there's a slurmd * listening there */ memset(&addr, 0, sizeof(addr)); addr.sin_family = req.af; /* TODO: until IPv6 support is added to Slurm, we must hope that the * other end is IPv4 */ if (req.af == AF_INET6) { error("IPv6 is not yet supported in Slurm"); /* For testing IPv6 callerid prior to Slurm IPv6 RPC support, * set a sane target, uncomment the following and comment out * the return code: addr.sin_family = AF_INET; target_slurmd = inet_addr("127.0.0.1"); //choose a test target */ return SLURM_FAILURE; } else memcpy(&target_slurmd, req.ip_src, 4); addr.sin_addr.s_addr = target_slurmd; addr.sin_port = htons(slurm_get_slurmd_port()); req_msg.address = addr; req_msg.msg_type = REQUEST_NETWORK_CALLERID; req_msg.data = &req; if (slurm_send_recv_node_msg(&req_msg, &resp_msg, 0) < 0) return SLURM_ERROR; switch (resp_msg.msg_type) { case RESPONSE_NETWORK_CALLERID: resp = (network_callerid_resp_t*)resp_msg.data; *job_id = resp->job_id; strncpy(node_name, resp->node_name, node_name_size); break; case RESPONSE_SLURM_RC: rc = ((return_code_msg_t *) resp_msg.data)->return_code; if (rc) slurm_seterrno_ret(rc); break; default: slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR); break; } slurm_free_network_callerid_msg(resp_msg.data); return SLURM_PROTOCOL_SUCCESS; }
/* * slurm_get_node_energy_n - issue RPC to get the energy data of all * configured sensors on the target machine * IN host - name of node to query, NULL if localhost * IN delta - Use cache if data is newer than this in seconds * OUT sensors_cnt - number of sensors * OUT energy - array of acct_gather_energy_t structures on success or * NULL other wise * RET 0 on success or a slurm error code * NOTE: free the response using xfree */ extern int slurm_get_node_energy(char *host, uint16_t delta, uint16_t *sensor_cnt, acct_gather_energy_t **energy) { int rc; slurm_msg_t req_msg; slurm_msg_t resp_msg; acct_gather_energy_req_msg_t req; uint32_t cluster_flags = slurmdb_setup_cluster_flags(); char *this_addr; xassert(sensor_cnt); xassert(energy); *sensor_cnt = 0; *energy = NULL; slurm_msg_t_init(&req_msg); slurm_msg_t_init(&resp_msg); if (host) slurm_conf_get_addr(host, &req_msg.address); else if (cluster_flags & CLUSTER_FLAG_MULTSD) { if ((this_addr = getenv("SLURMD_NODENAME"))) { slurm_conf_get_addr(this_addr, &req_msg.address); } else { this_addr = "localhost"; slurm_set_addr(&req_msg.address, (uint16_t)slurm_get_slurmd_port(), this_addr); } } else { char this_host[256]; /* * Set request message address to slurmd on localhost */ gethostname_short(this_host, sizeof(this_host)); this_addr = slurm_conf_get_nodeaddr(this_host); if (this_addr == NULL) this_addr = xstrdup("localhost"); slurm_set_addr(&req_msg.address, (uint16_t)slurm_get_slurmd_port(), this_addr); xfree(this_addr); } req.delta = delta; req_msg.msg_type = REQUEST_ACCT_GATHER_ENERGY; req_msg.data = &req; rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0); if (rc != 0 || !resp_msg.auth_cred) { error("slurm_get_node_energy: %m"); if (resp_msg.auth_cred) g_slurm_auth_destroy(resp_msg.auth_cred); return SLURM_ERROR; } if (resp_msg.auth_cred) g_slurm_auth_destroy(resp_msg.auth_cred); switch (resp_msg.msg_type) { case RESPONSE_ACCT_GATHER_ENERGY: *sensor_cnt = ((acct_gather_node_resp_msg_t *) resp_msg.data)->sensor_cnt; *energy = ((acct_gather_node_resp_msg_t *) resp_msg.data)->energy; ((acct_gather_node_resp_msg_t *) resp_msg.data)->energy = NULL; slurm_free_acct_gather_node_resp_msg(resp_msg.data); break; case RESPONSE_SLURM_RC: rc = ((return_code_msg_t *) resp_msg.data)->return_code; slurm_free_return_code_msg(resp_msg.data); if (rc) slurm_seterrno_ret(rc); break; default: slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR); break; } return SLURM_PROTOCOL_SUCCESS; }
/* * slurm_sprint_node_table - output information about a specific Slurm nodes * based upon message as loaded using slurm_load_node * IN node_ptr - an individual node information record pointer * IN one_liner - print as a single line if true * RET out - char * containing formatted output (must be freed after call) * NULL is returned on failure. */ char *slurm_sprint_node_table(node_info_t *node_ptr, int one_liner) { uint32_t my_state = node_ptr->node_state; char *cloud_str = "", *comp_str = "", *drain_str = "", *power_str = ""; char time_str[32]; char *out = NULL, *reason_str = NULL; uint16_t alloc_cpus = 0; int idle_cpus; uint64_t alloc_memory; char *node_alloc_tres = NULL; char *line_end = (one_liner) ? " " : "\n "; slurm_ctl_conf_info_msg_t *slurm_ctl_conf_ptr = NULL; if (slurm_load_ctl_conf((time_t) NULL, &slurm_ctl_conf_ptr) != SLURM_SUCCESS) fatal("Cannot load slurmctld conf file"); if (my_state & NODE_STATE_CLOUD) { my_state &= (~NODE_STATE_CLOUD); cloud_str = "+CLOUD"; } if (my_state & NODE_STATE_COMPLETING) { my_state &= (~NODE_STATE_COMPLETING); comp_str = "+COMPLETING"; } if (my_state & NODE_STATE_DRAIN) { my_state &= (~NODE_STATE_DRAIN); drain_str = "+DRAIN"; } if (my_state & NODE_STATE_FAIL) { my_state &= (~NODE_STATE_FAIL); drain_str = "+FAIL"; } if (my_state & NODE_STATE_POWER_SAVE) { my_state &= (~NODE_STATE_POWER_SAVE); power_str = "+POWER"; } if (my_state & NODE_STATE_POWERING_DOWN) { my_state &= (~NODE_STATE_POWERING_DOWN); power_str = "+POWERING_DOWN"; } slurm_get_select_nodeinfo(node_ptr->select_nodeinfo, SELECT_NODEDATA_SUBCNT, NODE_STATE_ALLOCATED, &alloc_cpus); idle_cpus = node_ptr->cpus - alloc_cpus; if (idle_cpus && (idle_cpus != node_ptr->cpus)) { my_state &= NODE_STATE_FLAGS; my_state |= NODE_STATE_MIXED; } /****** Line 1 ******/ xstrfmtcat(out, "NodeName=%s ", node_ptr->name); if (node_ptr->arch) xstrfmtcat(out, "Arch=%s ", node_ptr->arch); if (node_ptr->cpu_bind) { char tmp_str[128]; slurm_sprint_cpu_bind_type(tmp_str, node_ptr->cpu_bind); xstrfmtcat(out, "CpuBind=%s ", tmp_str); } xstrfmtcat(out, "CoresPerSocket=%u ", node_ptr->cores); xstrcat(out, line_end); /****** Line ******/ xstrfmtcat(out, "CPUAlloc=%u CPUTot=%u ", alloc_cpus, node_ptr->cpus); if (node_ptr->cpu_load == NO_VAL) xstrcat(out, "CPULoad=N/A"); else xstrfmtcat(out, "CPULoad=%.2f", (node_ptr->cpu_load / 100.0)); xstrcat(out, line_end); /****** Line ******/ xstrfmtcat(out, "AvailableFeatures=%s", node_ptr->features); xstrcat(out, line_end); /****** Line ******/ xstrfmtcat(out, "ActiveFeatures=%s", node_ptr->features_act); xstrcat(out, line_end); /****** Line ******/ xstrfmtcat(out, "Gres=%s", node_ptr->gres); xstrcat(out, line_end); /****** Line (optional) ******/ if (node_ptr->gres_drain) { xstrfmtcat(out, "GresDrain=%s", node_ptr->gres_drain); xstrcat(out, line_end); } /****** Line (optional) ******/ if (node_ptr->gres_used) { xstrfmtcat(out, "GresUsed=%s", node_ptr->gres_used); xstrcat(out, line_end); } /****** Line (optional) ******/ { bool line_used = false; if (node_ptr->node_addr) { xstrfmtcat(out, "NodeAddr=%s ", node_ptr->node_addr); line_used = true; } if (node_ptr->node_hostname) { xstrfmtcat(out, "NodeHostName=%s ", node_ptr->node_hostname); line_used = true; } if (node_ptr->port != slurm_get_slurmd_port()) { xstrfmtcat(out, "Port=%u ", node_ptr->port); line_used = true; } if (node_ptr->version && xstrcmp(node_ptr->version, slurm_ctl_conf_ptr->version)) { xstrfmtcat(out, "Version=%s", node_ptr->version); line_used = true; } if (line_used) xstrcat(out, line_end); } /****** Line ******/ if (node_ptr->os) { xstrfmtcat(out, "OS=%s ", node_ptr->os); xstrcat(out, line_end); } /****** Line ******/ slurm_get_select_nodeinfo(node_ptr->select_nodeinfo, SELECT_NODEDATA_MEM_ALLOC, NODE_STATE_ALLOCATED, &alloc_memory); xstrfmtcat(out, "RealMemory=%"PRIu64" AllocMem=%"PRIu64" ", node_ptr->real_memory, alloc_memory); if (node_ptr->free_mem == NO_VAL64) xstrcat(out, "FreeMem=N/A "); else xstrfmtcat(out, "FreeMem=%"PRIu64" ", node_ptr->free_mem); xstrfmtcat(out, "Sockets=%u Boards=%u", node_ptr->sockets, node_ptr->boards); xstrcat(out, line_end); /****** core & memory specialization Line (optional) ******/ if (node_ptr->core_spec_cnt || node_ptr->cpu_spec_list || node_ptr->mem_spec_limit) { if (node_ptr->core_spec_cnt) { xstrfmtcat(out, "CoreSpecCount=%u ", node_ptr->core_spec_cnt); } if (node_ptr->cpu_spec_list) { xstrfmtcat(out, "CPUSpecList=%s ", node_ptr->cpu_spec_list); } if (node_ptr->mem_spec_limit) { xstrfmtcat(out, "MemSpecLimit=%"PRIu64"", node_ptr->mem_spec_limit); } xstrcat(out, line_end); } /****** Line ******/ xstrfmtcat(out, "State=%s%s%s%s%s ThreadsPerCore=%u TmpDisk=%u Weight=%u ", node_state_string(my_state), cloud_str, comp_str, drain_str, power_str, node_ptr->threads, node_ptr->tmp_disk, node_ptr->weight); if (node_ptr->owner == NO_VAL) { xstrcat(out, "Owner=N/A "); } else { char *user_name = uid_to_string((uid_t) node_ptr->owner); xstrfmtcat(out, "Owner=%s(%u) ", user_name, node_ptr->owner); xfree(user_name); } xstrfmtcat(out, "MCS_label=%s", (node_ptr->mcs_label == NULL) ? "N/A" : node_ptr->mcs_label); xstrcat(out, line_end); /****** Line ******/ if ((node_ptr->next_state != NO_VAL) && (my_state & NODE_STATE_REBOOT)) { xstrfmtcat(out, "NextState=%s", node_state_string(node_ptr->next_state)); xstrcat(out, line_end); } /****** Line ******/ if (node_ptr->partitions) { xstrfmtcat(out, "Partitions=%s ", node_ptr->partitions); xstrcat(out, line_end); } /****** Line ******/ if (node_ptr->boot_time) { slurm_make_time_str((time_t *)&node_ptr->boot_time, time_str, sizeof(time_str)); xstrfmtcat(out, "BootTime=%s ", time_str); } else { xstrcat(out, "BootTime=None "); } if (node_ptr->slurmd_start_time) { slurm_make_time_str ((time_t *)&node_ptr->slurmd_start_time, time_str, sizeof(time_str)); xstrfmtcat(out, "SlurmdStartTime=%s", time_str); } else { xstrcat(out, "SlurmdStartTime=None"); } xstrcat(out, line_end); /****** TRES Line ******/ select_g_select_nodeinfo_get(node_ptr->select_nodeinfo, SELECT_NODEDATA_TRES_ALLOC_FMT_STR, NODE_STATE_ALLOCATED, &node_alloc_tres); xstrfmtcat(out, "CfgTRES=%s", node_ptr->tres_fmt_str); xstrcat(out, line_end); xstrfmtcat(out, "AllocTRES=%s", (node_alloc_tres) ? node_alloc_tres : ""); xfree(node_alloc_tres); xstrcat(out, line_end); /****** Power Management Line ******/ if (!node_ptr->power || (node_ptr->power->cap_watts == NO_VAL)) xstrcat(out, "CapWatts=n/a"); else xstrfmtcat(out, "CapWatts=%u", node_ptr->power->cap_watts); xstrcat(out, line_end); /****** Power Consumption Line ******/ if (!node_ptr->energy || node_ptr->energy->current_watts == NO_VAL) xstrcat(out, "CurrentWatts=n/s AveWatts=n/s"); else xstrfmtcat(out, "CurrentWatts=%u AveWatts=%u", node_ptr->energy->current_watts, node_ptr->energy->ave_watts); xstrcat(out, line_end); /****** external sensors Line ******/ if (!node_ptr->ext_sensors || node_ptr->ext_sensors->consumed_energy == NO_VAL64) xstrcat(out, "ExtSensorsJoules=n/s "); else xstrfmtcat(out, "ExtSensorsJoules=%"PRIu64" ", node_ptr->ext_sensors->consumed_energy); if (!node_ptr->ext_sensors || node_ptr->ext_sensors->current_watts == NO_VAL) xstrcat(out, "ExtSensorsWatts=n/s "); else xstrfmtcat(out, "ExtSensorsWatts=%u ", node_ptr->ext_sensors->current_watts); if (!node_ptr->ext_sensors || node_ptr->ext_sensors->temperature == NO_VAL) xstrcat(out, "ExtSensorsTemp=n/s"); else xstrfmtcat(out, "ExtSensorsTemp=%u", node_ptr->ext_sensors->temperature); xstrcat(out, line_end); /****** Line ******/ if (node_ptr->reason && node_ptr->reason[0]) xstrcat(reason_str, node_ptr->reason); if (reason_str) { int inx = 1; char *save_ptr = NULL, *tok, *user_name; tok = strtok_r(reason_str, "\n", &save_ptr); while (tok) { if (inx == 1) { xstrcat(out, "Reason="); } else { xstrcat(out, line_end); xstrcat(out, " "); } xstrfmtcat(out, "%s", tok); if ((inx++ == 1) && node_ptr->reason_time) { user_name = uid_to_string(node_ptr->reason_uid); slurm_make_time_str((time_t *)&node_ptr->reason_time, time_str, sizeof(time_str)); xstrfmtcat(out, " [%s@%s]", user_name, time_str); xfree(user_name); } tok = strtok_r(NULL, "\n", &save_ptr); } xfree(reason_str); } if (one_liner) xstrcat(out, "\n"); else xstrcat(out, "\n\n"); slurm_free_ctl_conf(slurm_ctl_conf_ptr); return out; }