static int
eliminate_nodes (char **hosts)
{
  hostlist_t hl = NULL;
  hostlist_t hlnew = NULL;
  hostlist_iterator_t hitr = NULL;
  ipmidetect_t id = NULL;
  char *host = NULL;
  char hostbuf[HOSTLIST_BUFLEN + 1];
  int rv = -1;

  assert (hosts);
  assert (*hosts);

  if (!(id = ipmidetect_handle_create ()))
    {
      fprintf (stderr,
               "ipmidetect_handle_create\n");
      goto cleanup;
    }

  if (ipmidetect_load_data (id,
                            NULL,
                            0,
                            0) < 0)
    {
      if (ipmidetect_errnum (id) == IPMIDETECT_ERR_CONNECT
          || ipmidetect_errnum (id) == IPMIDETECT_ERR_CONNECT_TIMEOUT)
        fprintf (stderr,
                 "Error connecting to ipmidetect daemon\n");
      else
        fprintf (stderr,
                 "ipmidetect_load_data: %s\n", ipmidetect_errormsg (id));
      goto cleanup;
    }

  if (!(hl = hostlist_create (*hosts)))
    {
      fprintf (stderr,
               "hostlist_create: %s\n",
               strerror (errno));
      goto cleanup;
    }

  if (!(hlnew = hostlist_create (*hosts)))
    {
      fprintf (stderr,
               "hostlist_create: %s\n",
               strerror (errno));
      goto cleanup;
    }

  if (!(hitr = hostlist_iterator_create (hl)))
    {
      fprintf (stderr,
               "hostlist_iterator_create: %s\n",
               strerror (errno));
      goto cleanup;
    }

  while ((host = hostlist_next (hitr)))
    {
      int ret;

      if ((ret = ipmidetect_is_node_detected (id, host)) < 0)
        {
          if (ipmidetect_errnum (id) == IPMIDETECT_ERR_NOTFOUND)
            fprintf (stderr,
                     "Node '%s' unrecognized by ipmidetect\n", host);
          else
            fprintf (stderr,
                     "ipmidetect_is_node_detected: %s\n", ipmidetect_errormsg (id));
          goto cleanup;
        }

      if (!ret)
        hostlist_delete (hlnew, host);

      free (host);
    }
  host = NULL;

  if (!hostlist_count (hlnew))
    {
      rv = 0;
      goto cleanup;
    }
 
  memset (hostbuf, '\0', HOSTLIST_BUFLEN + 1);
 
  if (hostlist_ranged_string (hlnew, HOSTLIST_BUFLEN, hostbuf) < 0)
    {
      fprintf (stderr,
               "hostlist_ranged_string: truncation\n");
      goto cleanup;
    }

  free (*hosts);
  if (!(*hosts = strdup (hostbuf)))
    {
      fprintf (stderr, "strdup: %s\n", strerror (errno));
      goto cleanup;
    }

  rv = hostlist_count (hlnew);
 cleanup:
  if (id)
    ipmidetect_handle_destroy (id);
  if (hitr)
    hostlist_iterator_destroy (hitr);
  if (hl)
    hostlist_destroy (hl);
  if (hlnew)
    hostlist_destroy (hlnew);
  free (host);
  return (rv);
}
Exemple #2
0
/*
 * Create an srun job structure for a step w/out an allocation response msg.
 * (i.e. inside an allocation)
 */
srun_job_t *
job_step_create_allocation(resource_allocation_response_msg_t *resp)
{
	uint32_t job_id = resp->job_id;
	srun_job_t *job = NULL;
	allocation_info_t *ai = xmalloc(sizeof(*ai));
	hostlist_t hl = NULL;
	char *buf = NULL;
	int count = 0;
	uint32_t alloc_count = 0;

	ai->jobid          = job_id;
	ai->stepid         = NO_VAL;
	ai->nodelist = opt.alloc_nodelist;
	hl = hostlist_create(ai->nodelist);
	hostlist_uniq(hl);
	alloc_count = hostlist_count(hl);
	ai->nnodes = alloc_count;
	hostlist_destroy(hl);

	if (opt.exc_nodes) {
		hostlist_t exc_hl = hostlist_create(opt.exc_nodes);
		hostlist_t inc_hl = NULL;
		char *node_name = NULL;

		hl = hostlist_create(ai->nodelist);
		if(opt.nodelist) {
			inc_hl = hostlist_create(opt.nodelist);
		}
		hostlist_uniq(hl);
		//info("using %s or %s", opt.nodelist, ai->nodelist);
		while ((node_name = hostlist_shift(exc_hl))) {
			int inx = hostlist_find(hl, node_name);
			if (inx >= 0) {
				debug("excluding node %s", node_name);
				hostlist_delete_nth(hl, inx);
				ai->nnodes--;	/* decrement node count */
			}
			if(inc_hl) {
				inx = hostlist_find(inc_hl, node_name);
				if (inx >= 0) {
					error("Requested node %s is also "
					      "in the excluded list.",
					      node_name);
					error("Job not submitted.");
					hostlist_destroy(exc_hl);
					hostlist_destroy(inc_hl);
					goto error;
				}
			}
			free(node_name);
		}
		hostlist_destroy(exc_hl);

		/* we need to set this here so if there are more nodes
		 * available than we requested we can set it
		 * straight. If there is no exclude list then we set
		 * the vars then.
		 */
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation
			 */
			if(opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if(!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;

		count = hostlist_count(hl);
		if(!count) {
			error("Hostlist is now nothing!  Can't run job.");
			hostlist_destroy(hl);
			goto error;
		}
		if(inc_hl) {
			count = hostlist_count(inc_hl);
			if(count < ai->nnodes) {
				/* add more nodes to get correct number for
				   allocation */
				hostlist_t tmp_hl = hostlist_copy(hl);
				int i=0;
				int diff = ai->nnodes - count;
				buf = hostlist_ranged_string_xmalloc(inc_hl);
				hostlist_delete(tmp_hl, buf);
				xfree(buf);
				while ((node_name = hostlist_shift(tmp_hl)) &&
				       (i < diff)) {
					hostlist_push(inc_hl, node_name);
					i++;
				}
				hostlist_destroy(tmp_hl);
			}
			buf = hostlist_ranged_string_xmalloc(inc_hl);
			hostlist_destroy(inc_hl);
			xfree(opt.nodelist);
			opt.nodelist = buf;
		} else {
			if (count > ai->nnodes) {
				/* remove more nodes than needed for
				   allocation */
				int i=0;
				for (i=count; i>ai->nnodes; i--)
					hostlist_delete_nth(hl, i);
			}
			xfree(opt.nodelist);
			opt.nodelist = hostlist_ranged_string_xmalloc(hl);
		}

		hostlist_destroy(hl);
	} else {
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation
			 */
			if(opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if(!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;
		/* Don't reset the ai->nodelist because that is the
		 * nodelist we want to say the allocation is under
		 * opt.nodelist is what is used for the allocation.
		 */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
	}

	/* get the correct number of hosts to run tasks on */
	if (opt.nodelist) {
		hl = hostlist_create(opt.nodelist);
		if (opt.distribution != SLURM_DIST_ARBITRARY)
			hostlist_uniq(hl);
		if (!hostlist_count(hl)) {
			error("Hostlist is now nothing!  Can not run job.");
			hostlist_destroy(hl);
			goto error;
		}

		buf = hostlist_ranged_string_xmalloc(hl);
		count = hostlist_count(hl);
		hostlist_destroy(hl);
		/* Don't reset the ai->nodelist because that is the
		 * nodelist we want to say the allocation is under
		 * opt.nodelist is what is used for the allocation.
		 */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
		xfree(opt.nodelist);
		opt.nodelist = buf;
	}

	if (opt.distribution == SLURM_DIST_ARBITRARY) {
		if (count != opt.ntasks) {
			error("You asked for %d tasks but specified %d nodes",
			      opt.ntasks, count);
			goto error;
		}
	}

	if (ai->nnodes == 0) {
		error("No nodes in allocation, can't run job");
		goto error;
	}

	ai->num_cpu_groups = resp->num_cpu_groups;
	ai->cpus_per_node  = resp->cpus_per_node;
	ai->cpu_count_reps = resp->cpu_count_reps;

/* 	info("looking for %d nodes out of %s with a must list of %s", */
/* 	     ai->nnodes, ai->nodelist, opt.nodelist); */
	/*
	 * Create job
	 */
	job = _job_create_structure(ai);
error:
   	xfree(ai);
	return (job);

}