static int eliminate_nodes (char **hosts) { hostlist_t hl = NULL; hostlist_t hlnew = NULL; hostlist_iterator_t hitr = NULL; ipmidetect_t id = NULL; char *host = NULL; char hostbuf[HOSTLIST_BUFLEN + 1]; int rv = -1; assert (hosts); assert (*hosts); if (!(id = ipmidetect_handle_create ())) { fprintf (stderr, "ipmidetect_handle_create\n"); goto cleanup; } if (ipmidetect_load_data (id, NULL, 0, 0) < 0) { if (ipmidetect_errnum (id) == IPMIDETECT_ERR_CONNECT || ipmidetect_errnum (id) == IPMIDETECT_ERR_CONNECT_TIMEOUT) fprintf (stderr, "Error connecting to ipmidetect daemon\n"); else fprintf (stderr, "ipmidetect_load_data: %s\n", ipmidetect_errormsg (id)); goto cleanup; } if (!(hl = hostlist_create (*hosts))) { fprintf (stderr, "hostlist_create: %s\n", strerror (errno)); goto cleanup; } if (!(hlnew = hostlist_create (*hosts))) { fprintf (stderr, "hostlist_create: %s\n", strerror (errno)); goto cleanup; } if (!(hitr = hostlist_iterator_create (hl))) { fprintf (stderr, "hostlist_iterator_create: %s\n", strerror (errno)); goto cleanup; } while ((host = hostlist_next (hitr))) { int ret; if ((ret = ipmidetect_is_node_detected (id, host)) < 0) { if (ipmidetect_errnum (id) == IPMIDETECT_ERR_NOTFOUND) fprintf (stderr, "Node '%s' unrecognized by ipmidetect\n", host); else fprintf (stderr, "ipmidetect_is_node_detected: %s\n", ipmidetect_errormsg (id)); goto cleanup; } if (!ret) hostlist_delete (hlnew, host); free (host); } host = NULL; if (!hostlist_count (hlnew)) { rv = 0; goto cleanup; } memset (hostbuf, '\0', HOSTLIST_BUFLEN + 1); if (hostlist_ranged_string (hlnew, HOSTLIST_BUFLEN, hostbuf) < 0) { fprintf (stderr, "hostlist_ranged_string: truncation\n"); goto cleanup; } free (*hosts); if (!(*hosts = strdup (hostbuf))) { fprintf (stderr, "strdup: %s\n", strerror (errno)); goto cleanup; } rv = hostlist_count (hlnew); cleanup: if (id) ipmidetect_handle_destroy (id); if (hitr) hostlist_iterator_destroy (hitr); if (hl) hostlist_destroy (hl); if (hlnew) hostlist_destroy (hlnew); free 
(host); return (rv); }
/*
 * Create an srun job structure for a step w/out an allocation response msg.
 * (i.e. inside an allocation)
 *
 * resp - supplies job_id, num_cpu_groups, cpus_per_node and cpu_count_reps
 *        for the allocation info handed to _job_create_structure().
 *
 * The allocation node list is taken from opt.alloc_nodelist.  Nodes named in
 * opt.exc_nodes are removed, and opt.nodelist, opt.min_nodes, opt.max_nodes
 * and opt.nodes_set may be rewritten to describe the nodes the step will use.
 *
 * Returns the value of _job_create_structure(), or NULL if an error was
 * reported (excluded node also requested, empty host list, task/node count
 * mismatch for arbitrary distribution, or no nodes in the allocation).
 */
srun_job_t *
job_step_create_allocation(resource_allocation_response_msg_t *resp)
{
	uint32_t job_id = resp->job_id;
	srun_job_t *job = NULL;
	allocation_info_t *ai = xmalloc(sizeof(*ai));
	hostlist_t hl = NULL;
	char *buf = NULL;
	int count = 0;
	uint32_t alloc_count = 0;

	ai->jobid = job_id;
	ai->stepid = NO_VAL;
	ai->nodelist = opt.alloc_nodelist;
	hl = hostlist_create(ai->nodelist);
	hostlist_uniq(hl);
	alloc_count = hostlist_count(hl);
	ai->nnodes = alloc_count;
	hostlist_destroy(hl);

	if (opt.exc_nodes) {
		hostlist_t exc_hl = hostlist_create(opt.exc_nodes);
		hostlist_t inc_hl = NULL;
		char *node_name = NULL;

		hl = hostlist_create(ai->nodelist);
		if (opt.nodelist) {
			inc_hl = hostlist_create(opt.nodelist);
		}
		hostlist_uniq(hl);
		//info("using %s or %s", opt.nodelist, ai->nodelist);
		while ((node_name = hostlist_shift(exc_hl))) {
			int inx = hostlist_find(hl, node_name);
			if (inx >= 0) {
				debug("excluding node %s", node_name);
				hostlist_delete_nth(hl, inx);
				ai->nnodes--;	/* decrement node count */
			}
			if (inc_hl) {
				inx = hostlist_find(inc_hl, node_name);
				if (inx >= 0) {
					error("Requested node %s is also "
					      "in the excluded list.",
					      node_name);
					error("Job not submitted.");
					/* release everything still owned
					 * by this scope before bailing out */
					free(node_name);
					hostlist_destroy(exc_hl);
					hostlist_destroy(inc_hl);
					hostlist_destroy(hl);
					goto error;
				}
			}
			free(node_name);
		}
		hostlist_destroy(exc_hl);

		/* we need to set this here so if there are more nodes
		 * available than we requested we can set it
		 * straight. If there is no exclude list then we set
		 * the vars then.
		 */
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation
			 */
			if (opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if (!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if ((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;

		count = hostlist_count(hl);
		if (!count) {
			error("Hostlist is now nothing! Can't run job.");
			hostlist_destroy(hl);
			if (inc_hl)
				hostlist_destroy(inc_hl);
			goto error;
		}
		if (inc_hl) {
			count = hostlist_count(inc_hl);
			if (count < ai->nnodes) {
				/* add more nodes to get correct number for
				 * allocation */
				hostlist_t tmp_hl = hostlist_copy(hl);
				int i = 0;
				int diff = ai->nnodes - count;

				/* drop the already-requested nodes from the
				 * candidate pool */
				buf = hostlist_ranged_string_xmalloc(inc_hl);
				hostlist_delete(tmp_hl, buf);
				xfree(buf);
				/* test the bound first so no host is shifted
				 * (and leaked) once enough were added; the
				 * shifted name is ours to free, as in the
				 * exclude loop above */
				while ((i < diff) &&
				       (node_name = hostlist_shift(tmp_hl))) {
					hostlist_push(inc_hl, node_name);
					free(node_name);
					i++;
				}
				hostlist_destroy(tmp_hl);
			}
			buf = hostlist_ranged_string_xmalloc(inc_hl);
			hostlist_destroy(inc_hl);
			xfree(opt.nodelist);
			opt.nodelist = buf;
		} else {
			if (count > ai->nnodes) {
				/* remove more nodes than needed for
				 * allocation: indices are 0-based (see the
				 * hostlist_find() use above), so keep
				 * [0, nnodes) and delete from the tail */
				int i;
				for (i = count - 1;
				     (i >= 0) && (i >= ai->nnodes); i--)
					hostlist_delete_nth(hl, i);
			}
			xfree(opt.nodelist);
			opt.nodelist = hostlist_ranged_string_xmalloc(hl);
		}

		hostlist_destroy(hl);
	} else {
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation
			 */
			if (opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if (!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if ((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;
		/* Don't reset the ai->nodelist because that is the
		 * nodelist we want to say the allocation is under
		 * opt.nodelist is what is used for the allocation.
		 */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
	}

	/* get the correct number of hosts to run tasks on */
	if (opt.nodelist) {
		hl = hostlist_create(opt.nodelist);
		if (opt.distribution != SLURM_DIST_ARBITRARY)
			hostlist_uniq(hl);
		if (!hostlist_count(hl)) {
			error("Hostlist is now nothing! Can not run job.");
			hostlist_destroy(hl);
			goto error;
		}

		buf = hostlist_ranged_string_xmalloc(hl);
		count = hostlist_count(hl);
		hostlist_destroy(hl);
		/* Don't reset the ai->nodelist because that is the
		 * nodelist we want to say the allocation is under
		 * opt.nodelist is what is used for the allocation.
		 */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
		xfree(opt.nodelist);
		opt.nodelist = buf;
	}

	if (opt.distribution == SLURM_DIST_ARBITRARY) {
		if (count != opt.ntasks) {
			error("You asked for %d tasks but specified %d nodes",
			      opt.ntasks, count);
			goto error;
		}
	}

	if (ai->nnodes == 0) {
		error("No nodes in allocation, can't run job");
		goto error;
	}

	ai->num_cpu_groups = resp->num_cpu_groups;
	ai->cpus_per_node = resp->cpus_per_node;
	ai->cpu_count_reps = resp->cpu_count_reps;

	/* info("looking for %d nodes out of %s with a must list of %s", */
	/*      ai->nnodes, ai->nodelist, opt.nodelist); */
	/*
	 * Create job
	 */
	job = _job_create_structure(ai);
error:
	/* ai itself is only scratch space; its nodelist points at
	 * opt.alloc_nodelist and is not freed here */
	xfree(ai);
	return (job);
}