Example #1
struct ipmipower_connection *
ipmipower_connection_array_create(const char *hostname, unsigned int *len) 
{
  char *str = NULL;
  int index = 0;
  hostlist_t hl = NULL;
  hostlist_iterator_t itr = NULL;
  struct ipmipower_connection *ics;
  int size = sizeof(struct ipmipower_connection);
  int hl_count;
  int errcount = 0;
  int emfilecount = 0;

  assert(hostname && len); 

  *len = 0;
  
  if (!(hl = hostlist_create(hostname)))
    {
      ipmipower_output(MSG_TYPE_HOSTNAME_INVALID, hostname);
      return NULL;
    }
  
  if (!(itr = hostlist_iterator_create(hl)))
    ierr_exit("hostlist_iterator_create() error"); 
  
  hostlist_uniq(hl);

  hl_count = hostlist_count(hl);

  ics = (struct ipmipower_connection *)Malloc(size * hl_count);
  
  memset(ics, '\0', (size * hl_count));
  
  while ((str = hostlist_next(itr))) 
    {
      ics[index].ipmi_fd = -1;
      ics[index].ping_fd = -1;
      
      /* Cleanup is done only at the end; gather all error output
       * for later.
       */
      if (_connection_setup(&ics[index], str) < 0) 
        {
          if (errno == EMFILE && !emfilecount)
            {
              cbuf_printf(ttyout, "file descriptor limit reached\n");
              emfilecount++;
            }
          errcount++;
        }
       
      free(str);
      index++;
    }

  hostlist_iterator_destroy(itr);
  hostlist_destroy(hl);

  if (errcount)
    {
      int i;
      for (i = 0; i < hl_count; i++) 
        {
          close(ics[i].ipmi_fd);
          close(ics[i].ping_fd);
          if (ics[i].ipmi_in)
            cbuf_destroy(ics[i].ipmi_in);
          if (ics[i].ipmi_out)
            cbuf_destroy(ics[i].ipmi_out);
          if (ics[i].ping_in)
            cbuf_destroy(ics[i].ping_in);
          if (ics[i].ping_out)
            cbuf_destroy(ics[i].ping_out);
        }
      Free(ics);
      return NULL;
    }

  *len = hl_count;
  return ics;
}
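
Example #1 shows the canonical hostlist lifecycle: create the list from a ranged hostname string, deduplicate it, walk it with an iterator, and destroy both objects when done. The minimal standalone sketch below illustrates that pattern; it assumes Slurm's in-tree header src/common/hostlist.h and drops the wrappers (Malloc, ierr_exit) the example uses, so treat it as an illustration rather than drop-in code.

#include <stdio.h>
#include <stdlib.h>
#include "src/common/hostlist.h"	/* assumed in-tree header */

/* Expand a ranged string such as "node[1-4]" and visit each host once.
 * Returns 0 on success, -1 on an invalid hostname string. */
int visit_hosts(const char *ranged)
{
	hostlist_t hl;
	hostlist_iterator_t itr;
	char *host;

	if (!(hl = hostlist_create(ranged)))
		return -1;

	hostlist_uniq(hl);	/* drop duplicate entries, as Example #1 does */

	if (!(itr = hostlist_iterator_create(hl))) {
		hostlist_destroy(hl);
		return -1;
	}

	while ((host = hostlist_next(itr))) {
		printf("host: %s\n", host);
		free(host);	/* strings from hostlist_next() are released with free() */
	}

	hostlist_iterator_destroy(itr);
	hostlist_destroy(hl);
	return 0;
}
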
Example #2
extern int sacctmgr_list_cluster(int argc, char *argv[])
{
	int rc = SLURM_SUCCESS;
	slurmdb_cluster_cond_t *cluster_cond =
		xmalloc(sizeof(slurmdb_cluster_cond_t));
	List cluster_list;
	int i=0;
	ListIterator itr = NULL;
	ListIterator itr2 = NULL;
	slurmdb_cluster_rec_t *cluster = NULL;
	char *tmp_char = NULL;

	int field_count = 0;

	print_field_t *field = NULL;

	List format_list = list_create(slurm_destroy_char);
	List print_fields_list; /* types are of print_field_t */

	slurmdb_init_cluster_cond(cluster_cond, 0);
	cluster_cond->cluster_list = list_create(slurm_destroy_char);
	for (i=0; i<argc; i++) {
		int command_len = strlen(argv[i]);
		if (!strncasecmp(argv[i], "Where", MAX(command_len, 5))
		    || !strncasecmp(argv[i], "Set", MAX(command_len, 3)))
			i++;
		_set_cond(&i, argc, argv, cluster_cond, format_list);
	}

	if(exit_code) {
		slurmdb_destroy_cluster_cond(cluster_cond);
		list_destroy(format_list);
		return SLURM_ERROR;
	}

	if(!list_count(format_list)) {
		slurm_addto_char_list(format_list,
				      "Cl,Controlh,Controlp,RPC");
		if(!without_limits)
			slurm_addto_char_list(format_list,
					      "Fa,GrpJ,GrpN,GrpS,MaxJ,MaxN,"
					      "MaxS,MaxW,QOS,DefaultQOS");
	}

	cluster_cond->with_deleted = with_deleted;

	print_fields_list = sacctmgr_process_format_list(format_list);
	list_destroy(format_list);

	if(exit_code) {
		slurmdb_destroy_cluster_cond(cluster_cond);
		list_destroy(print_fields_list);
		return SLURM_ERROR;
	}

	cluster_list = acct_storage_g_get_clusters(db_conn, my_uid,
						   cluster_cond);
	slurmdb_destroy_cluster_cond(cluster_cond);

	if(!cluster_list) {
		exit_code=1;
		fprintf(stderr, " Problem with query.\n");
		list_destroy(print_fields_list);
		return SLURM_ERROR;
	}

	itr = list_iterator_create(cluster_list);
	itr2 = list_iterator_create(print_fields_list);
	print_fields_header(print_fields_list);

	field_count = list_count(print_fields_list);

	while((cluster = list_next(itr))) {
		int curr_inx = 1;
		slurmdb_association_rec_t *assoc = cluster->root_assoc;
		/* set up the working cluster rec so node counts and node names
		 * are handled correctly */
		working_cluster_rec = cluster;
		while((field = list_next(itr2))) {
			switch(field->type) {
			case PRINT_CLUSTER:
				field->print_routine(field,
						     cluster->name,
						     (curr_inx == field_count));
				break;
			case PRINT_CHOST:
				field->print_routine(field,
						     cluster->control_host,
						     (curr_inx == field_count));
				break;
			case PRINT_CPORT:
				field->print_routine(field,
						     cluster->control_port,
						     (curr_inx == field_count));
				break;
			case PRINT_CLASS:
				field->print_routine(field,
						     get_classification_str(
							     cluster->
							     classification),
						     (curr_inx == field_count));
				break;
			case PRINT_CPUS:
			{
				char tmp_char[9];
				convert_num_unit((float)cluster->cpu_count,
						 tmp_char, sizeof(tmp_char),
						 UNIT_NONE);
				field->print_routine(field,
						     tmp_char,
						     (curr_inx == field_count));
				break;
			}
			case PRINT_DQOS:
				if(!g_qos_list) {
					g_qos_list = acct_storage_g_get_qos(
						db_conn,
						my_uid,
						NULL);
				}
				tmp_char = slurmdb_qos_str(g_qos_list,
							   assoc->def_qos_id);
				field->print_routine(
					field,
					tmp_char,
					(curr_inx == field_count));
				break;
			case PRINT_FAIRSHARE:
				field->print_routine(
					field,
					assoc->shares_raw,
					(curr_inx == field_count));
				break;
			case PRINT_FLAGS:
			{
				char *tmp_char = slurmdb_cluster_flags_2_str(
					cluster->flags);
				field->print_routine(
					field,
					tmp_char,
					(curr_inx == field_count));
				xfree(tmp_char);
				break;
			}
			case PRINT_GRPC:
				field->print_routine(field,
						     assoc->grp_cpus,
						     (curr_inx == field_count));
				break;
			case PRINT_GRPJ:
				field->print_routine(field,
						     assoc->grp_jobs,
						     (curr_inx == field_count));
				break;
			case PRINT_GRPN:
				field->print_routine(field,
						     assoc->grp_nodes,
						     (curr_inx == field_count));
				break;
			case PRINT_GRPS:
				field->print_routine(field,
						     assoc->grp_submit_jobs,
						     (curr_inx == field_count));
				break;
			case PRINT_MAXCM:
				field->print_routine(
					field,
					assoc->max_cpu_mins_pj,
					(curr_inx == field_count));
				break;
			case PRINT_MAXC:
				field->print_routine(field,
						     assoc->max_cpus_pj,
						     (curr_inx == field_count));
				break;
			case PRINT_MAXJ:
				field->print_routine(field,
						     assoc->max_jobs,
						     (curr_inx == field_count));
				break;
			case PRINT_MAXN:
				field->print_routine(field,
						     assoc->max_nodes_pj,
						     (curr_inx == field_count));
				break;
			case PRINT_MAXS:
				field->print_routine(field,
						     assoc->max_submit_jobs,
						     (curr_inx == field_count));
				break;
			case PRINT_MAXW:
				field->print_routine(
					field,
					assoc->max_wall_pj,
					(curr_inx == field_count));
				break;

			case PRINT_NODECNT:
			{
				hostlist_t hl = hostlist_create(cluster->nodes);
				int cnt = 0;
				if(hl) {
					cnt = hostlist_count(hl);
					hostlist_destroy(hl);
				}
				field->print_routine(
					field,
					cnt,
					(curr_inx == field_count));
				break;
			}
			case PRINT_CLUSTER_NODES:
				field->print_routine(
					field,
					cluster->nodes,
					(curr_inx == field_count));
				break;
			case PRINT_QOS:
				if(!g_qos_list)
					g_qos_list = acct_storage_g_get_qos(
						db_conn, my_uid, NULL);

				field->print_routine(field,
						     g_qos_list,
						     assoc->qos_list,
						     (curr_inx == field_count));
				break;
			case PRINT_QOS_RAW:
				field->print_routine(field,
						     assoc->qos_list,
						     (curr_inx == field_count));
				break;
			case PRINT_RPC_VERSION:
				field->print_routine(
					field,
					cluster->rpc_version,
					(curr_inx == field_count));
				break;
			case PRINT_SELECT:
				field->print_routine(
					field,
					cluster->plugin_id_select,
					(curr_inx == field_count));
				break;
			default:
				field->print_routine(
					field, NULL,
					(curr_inx == field_count));
				break;
			}
			curr_inx++;
		}
		list_iterator_reset(itr2);
		printf("\n");
	}
	/* clear the working cluster rec */
	working_cluster_rec = NULL;

	list_iterator_destroy(itr2);
	list_iterator_destroy(itr);
	list_destroy(cluster_list);
	list_destroy(print_fields_list);

	return rc;
}
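
The PRINT_NODECNT branch above is the standard idiom for counting the hosts in a ranged node string without expanding it by hand. Factored into a helper, it looks like the following sketch, under the same assumption about the in-tree hostlist header:

#include "src/common/hostlist.h"	/* assumed in-tree header */

/* Count the hosts in a ranged string such as "tux[0-15]".
 * Returns 0 when the string is empty or cannot be parsed. */
static int node_count(const char *nodes)
{
	hostlist_t hl = hostlist_create(nodes);
	int cnt = 0;

	if (hl) {
		cnt = hostlist_count(hl);
		hostlist_destroy(hl);
	}
	return cnt;
}
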
Example #3
extern List setup_cluster_list_with_inx(mysql_conn_t *mysql_conn,
                                        slurmdb_job_cond_t *job_cond,
                                        void **curr_cluster)
{
    List local_cluster_list = NULL;
    time_t now = time(NULL);
    MYSQL_RES *result = NULL;
    MYSQL_ROW row;
    hostlist_t temp_hl = NULL;
    hostlist_iterator_t h_itr = NULL;
    char *query = NULL;
    int dims = 0;

    if (!job_cond || !job_cond->used_nodes)
        return NULL;

    if (!job_cond->cluster_list
            || list_count(job_cond->cluster_list) != 1) {
        error("If you are doing a query against nodes "
              "you must only have 1 cluster "
              "you are asking for.");
        return NULL;
    }

    /* get the dimensions of this cluster so we know how to deal
       with the hostlists */
    query = xstrdup_printf("select dimensions from %s where name='%s'",
                           cluster_table,
                           (char *)list_peek(job_cond->cluster_list));

    debug4("%d(%s:%d) query\n%s",
           mysql_conn->conn, THIS_FILE, __LINE__, query);
    if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
        xfree(query);
        return NULL;
    }
    xfree(query);

    if (!(row = mysql_fetch_row(result))) {
        error("Couldn't get the dimensions of cluster '%s'.",
              (char *)list_peek(job_cond->cluster_list));
        mysql_free_result(result);	/* free the result on the error path too */
        return NULL;
    }
    dims = atoi(row[0]);
    mysql_free_result(result);

    temp_hl = hostlist_create_dims(job_cond->used_nodes, dims);
    if (hostlist_count(temp_hl) <= 0) {
        error("we didn't get any real hosts to look for.");
        goto no_hosts;
    }
    h_itr = hostlist_iterator_create(temp_hl);

    query = xstrdup_printf("select cluster_nodes, time_start, "
                           "time_end from \"%s_%s\" where node_name='' "
                           "&& cluster_nodes !=''",
                           (char *)list_peek(job_cond->cluster_list),
                           event_table);

    if (job_cond->usage_start) {
        if (!job_cond->usage_end)
            job_cond->usage_end = now;

        xstrfmtcat(query,
                   " && ((time_start < %ld) "
                   "&& (time_end >= %ld || time_end = 0))",
                   job_cond->usage_end, job_cond->usage_start);
    }

    debug3("%d(%s:%d) query\n%s",
           mysql_conn->conn, THIS_FILE, __LINE__, query);
    if (!(result = mysql_db_query_ret(mysql_conn, query, 0))) {
        xfree(query);
        goto no_hosts;
    }
    xfree(query);

    local_cluster_list = list_create(_destroy_local_cluster);
    while ((row = mysql_fetch_row(result))) {
        char *host = NULL;
        int loc = 0;
        local_cluster_t *local_cluster =
            xmalloc(sizeof(local_cluster_t));
        local_cluster->hl = hostlist_create_dims(row[0], dims);
        local_cluster->start = slurm_atoul(row[1]);
        local_cluster->end   = slurm_atoul(row[2]);
        local_cluster->asked_bitmap =
            bit_alloc(hostlist_count(local_cluster->hl));
        while ((host = hostlist_next_dims(h_itr, dims))) {
            if ((loc = hostlist_find(
                           local_cluster->hl, host)) != -1)
                bit_set(local_cluster->asked_bitmap, loc);
            free(host);
        }
        hostlist_iterator_reset(h_itr);
        if (bit_ffs(local_cluster->asked_bitmap) != -1) {
            list_append(local_cluster_list, local_cluster);
            if (local_cluster->end == 0) {
                local_cluster->end = now;
                (*curr_cluster) = local_cluster;
            }
        } else
            _destroy_local_cluster(local_cluster);
    }
    mysql_free_result(result);

    if (!list_count(local_cluster_list)) {
        list_destroy(local_cluster_list);
        local_cluster_list = NULL;
        goto no_hosts;
    }

no_hosts:

    hostlist_iterator_destroy(h_itr);
    hostlist_destroy(temp_hl);

    return local_cluster_list;
}
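
The core of Example #3 is its matching step: for each cluster snapshot row, every requested host is probed with hostlist_find() and hits are recorded in a bitmap, so bit_ffs() can later tell whether the row is relevant at all. Below is a reduced sketch of that step, assuming Slurm's in-tree bitstring API (src/common/bitstring.h) alongside the hostlist API, with error handling elided:

#include <stdlib.h>
#include "src/common/bitstring.h"	/* assumed in-tree header */
#include "src/common/hostlist.h"	/* assumed in-tree header */

/* Set a bit for each host from `asked` that also appears in `have`.
 * The caller owns the returned bitmap (release with FREE_NULL_BITMAP). */
static bitstr_t *mark_asked_hosts(hostlist_t have, hostlist_t asked)
{
	bitstr_t *bitmap = bit_alloc(hostlist_count(have));
	hostlist_iterator_t itr = hostlist_iterator_create(asked);
	char *host;
	int loc;

	while ((host = hostlist_next(itr))) {
		if ((loc = hostlist_find(have, host)) != -1)
			bit_set(bitmap, loc);
		free(host);
	}
	hostlist_iterator_destroy(itr);
	return bitmap;	/* bit_ffs(bitmap) != -1 means at least one match */
}
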
Example #4
static int _resources_set(char ***env)
{
	char *p = NULL;

	/* Initialize all pointers that would be allocated to NULL,
	 * so on an error exit we know what to xfree.
	 */
	_pmixp_job_info.job_hl = hostlist_create("");
	_pmixp_job_info.step_hl = hostlist_create("");
	_pmixp_job_info.hostname = NULL;

	/* Save step host list */
	p = getenvp(*env, PMIXP_STEP_NODES_ENV);
	if (!p) {
		PMIXP_ERROR_NO(ENOENT, "Environment variable %s not found",
				PMIXP_STEP_NODES_ENV);
		goto err_exit;
	}
	hostlist_push(_pmixp_job_info.step_hl, p);

	/* Extract our node name */
	p = hostlist_nth(_pmixp_job_info.step_hl, _pmixp_job_info.node_id);
	_pmixp_job_info.hostname = xstrdup(p);
	free(p);

	/* Determine job-wide node id and job-wide node count */
	p = getenvp(*env, PMIXP_JOB_NODES_ENV);
	if (p == NULL) {
		p = getenvp(*env, PMIXP_JOB_NODES_ENV_DEP);
		if (p == NULL) {
			/* shouldn't happen if we are under SLURM! */
			PMIXP_ERROR_NO(ENOENT, "Neither of nodelist environment variables: %s OR %s was found!",
					PMIXP_JOB_NODES_ENV, PMIXP_JOB_NODES_ENV_DEP);
			goto err_exit;
		}
	}
	hostlist_push(_pmixp_job_info.job_hl, p);
	_pmixp_job_info.nnodes_job = hostlist_count(_pmixp_job_info.job_hl);
	_pmixp_job_info.node_id_job = hostlist_find(_pmixp_job_info.job_hl,
			_pmixp_job_info.hostname);

	/* FIXME!! ------------------------------------------------------------- */
	/* TODO: _get_task_count not always works well.
	 if (_get_task_count(env, &_pmixp_job_info.ntasks_job, &_pmixp_job_info.ncpus_job) < 0) {
	 _pmixp_job_info.ntasks_job  = _pmixp_job_info.ntasks;
	 _pmixp_job_info.ncpus_job  = _pmixp_job_info.ntasks;
	 }
	 xassert(_pmixp_job_info.ntasks <= _pmixp_job_info.ntasks_job);
	 */
	_pmixp_job_info.ntasks_job = _pmixp_job_info.ntasks;
	_pmixp_job_info.ncpus_job = _pmixp_job_info.ntasks;

	/* Save task-to-node mapping */
	p = getenvp(*env, PMIXP_SLURM_MAPPING_ENV);
	if (p == NULL) {
		/* Direct modex won't work */
		PMIXP_ERROR_NO(ENOENT, "No %s environment variable found!",
				PMIXP_SLURM_MAPPING_ENV);
		goto err_exit;
	}

	_pmixp_job_info.task_map_packed = xstrdup(p);

	return SLURM_SUCCESS;
err_exit:
	hostlist_destroy(_pmixp_job_info.job_hl);
	hostlist_destroy(_pmixp_job_info.step_hl);
	if (NULL != _pmixp_job_info.hostname) {
		xfree(_pmixp_job_info.hostname);
	}
	return SLURM_ERROR;
}
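
Two hostlist calls do the node-identity work in Example #4: hostlist_nth() resolves this process's hostname from the step host list (the returned string comes from malloc() and is released with free(), which is why the code keeps an xstrdup() copy), and hostlist_find() turns that hostname into a job-wide node id. A minimal sketch of the pair, assuming the in-tree headers src/common/hostlist.h and src/common/xstring.h:

#include <stdlib.h>
#include "src/common/hostlist.h"	/* assumed in-tree header */
#include "src/common/xstring.h"	/* assumed in-tree header, for xstrdup() */

/* Resolve the name of step-local node `node_id` as an xstrdup()'d copy
 * that the caller later releases with xfree(). */
static char *step_node_name(hostlist_t step_hl, int node_id)
{
	char *p = hostlist_nth(step_hl, node_id);	/* malloc'd by hostlist */
	char *name = xstrdup(p);

	free(p);
	return name;
}

/* Job-wide node id of `hostname`, or -1 if it is not in the job list. */
static int job_node_id(hostlist_t job_hl, const char *hostname)
{
	return hostlist_find(job_hl, hostname);
}
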
Example #5
/*
 * Read a Slurm hostfile specified by "filename".  "filename" must contain
 * a list of Slurm NodeNames, one per line.  Reads up to "n" hostnames
 * from the file and returns a hostlist ranged string representing the
 * contents of the file.  This is a helper function; it does not
 * contact any Slurm daemons.
 *
 * Returns NULL if there are fewer than "n" hostnames in the file, or if
 * an error occurs.  If "n" == NO_VAL, the entire file is read in.
 *
 * The returned string must be freed with free().
 */
char *slurm_read_hostfile(const char *filename, int n)
{
	FILE *fp = NULL;
	char in_line[BUFFER_SIZE];	/* input line */
	int i, j;
	int line_size;
	int line_num = 0;
	hostlist_t hostlist = NULL;
	char *nodelist = NULL, *end_part = NULL;
	char *asterisk, *tmp_text = NULL, *save_ptr = NULL, *host_name;
	int total_file_len = 0;

	if (filename == NULL || strlen(filename) == 0)
		return NULL;

	if ((fp = fopen(filename, "r")) == NULL) {
		error("slurm_allocate_resources error opening file %s, %m",
		      filename);
		return NULL;
	}

	hostlist = hostlist_create(NULL);
	if (hostlist == NULL) {
		fclose(fp);
		return NULL;
	}

	while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {

		line_size = strlen(in_line);
		for (i = 0; i < line_size; i++) {
			if (in_line[i] == '\n') {
				in_line[i] = '\0';
				break;
			}
			if (in_line[i] == '\0')
				break;
			if (in_line[i] != '#')
				continue;
			if ((i > 0) && (in_line[i - 1] == '\\')) {
				for (j = i; j < line_size; j++) {
					in_line[j - 1] = in_line[j];
				}
				line_size--;
				continue;
			}
			in_line[i] = '\0';
			break;
		}

		/*
		 * Get the string length again, in case it changed in the
		 * above loop.
		 */
		line_size = strlen(in_line);
		total_file_len += line_size;

		/*
		 * If there was a leftover end section from before, put it at
		 * the front of this next chunk.
		 */
		if (end_part) {
			tmp_text = end_part;
			end_part = NULL;
		}

		if (line_size == (BUFFER_SIZE - 1)) {
			/*
			 * If we filled up the buffer, save everything past the
			 * last comma and tack it on during the next pass.
			 */
			char *last_comma = strrchr(in_line, ',');
			if (!last_comma) {
				error("Line %d, of hostfile %s too long",
				      line_num, filename);
				fclose(fp);
				hostlist_destroy(hostlist);
				return NULL;
			}
			end_part = xstrdup(last_comma + 1);
			*last_comma = '\0';
		} else
			line_num++;

		xstrcat(tmp_text, in_line);

		/* Skip this line */
		if (tmp_text[0] == '\0')
			continue;

		if (!isalpha(tmp_text[0]) && !isdigit(tmp_text[0])) {
			error("Invalid hostfile %s contents on line %d",
			      filename, line_num);
			fclose(fp);
			hostlist_destroy(hostlist);
			xfree(end_part);
			xfree(tmp_text);
			return NULL;
		}

		host_name = strtok_r(tmp_text, ",", &save_ptr);
		while (host_name) {
			if ((asterisk = strchr(host_name, '*')) &&
			    (i = atoi(asterisk + 1))) {
				asterisk[0] = '\0';

				/*
				 * Don't forget the extra space potentially
				 * needed
				 */
				total_file_len += strlen(host_name) * i;

				for (j = 0; j < i; j++)
					hostlist_push_host(hostlist, host_name);
			} else {
				hostlist_push_host(hostlist, host_name);
			}
			host_name = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp_text);

		if ((n != (int)NO_VAL) && (hostlist_count(hostlist) == n))
			break;
	}
	fclose(fp);

	if (hostlist_count(hostlist) <= 0) {
		error("Hostlist is empty!");
		goto cleanup_hostfile;
	}
	if (hostlist_count(hostlist) < n) {
		error("Too few NodeNames in Slurm Hostfile");
		goto cleanup_hostfile;
	}

	total_file_len += 1024;
	nodelist = (char *)malloc(total_file_len);
	if (!nodelist) {
		error("Nodelist xmalloc failed");
		goto cleanup_hostfile;
	}

	if (hostlist_ranged_string(hostlist, total_file_len, nodelist) == -1) {
		error("Hostlist is too long for the allocate RPC!");
		free(nodelist);
		nodelist = NULL;
		goto cleanup_hostfile;
	}

	debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist);

cleanup_hostfile:
	hostlist_destroy(hostlist);
	xfree(end_part);
	xfree(tmp_text);

	return nodelist;
}
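
As the header comment states, the returned nodelist is allocated with malloc() and must be released with free(); NULL signals an error or too few hostnames. A usage sketch, assuming the public slurm/slurm.h header (which declares slurm_read_hostfile() and NO_VAL); the file path is illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <slurm/slurm.h>	/* assumed public header */

/* Read every hostname from a hostfile (n == NO_VAL) and print the
 * resulting ranged string. */
static void print_hostfile_nodelist(void)
{
	char *nodelist = slurm_read_hostfile("/tmp/my_hostfile", NO_VAL);

	if (nodelist) {
		printf("nodelist: %s\n", nodelist);
		free(nodelist);	/* allocated with malloc(), freed with free() */
	}
}
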
Example #6
File: job_info.c  Project: IFCA/slurm
/*
 * slurm_sprint_job_info - output information about a specific Slurm
 *	job based upon the message loaded using slurm_load_jobs
 * IN job_ptr - an individual job information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *           NULL is returned on failure.
 */
extern char *
slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
{
	int i, j;
	char time_str[32], *group_name, *user_name;
	char tmp1[128], tmp2[128], tmp3[128], tmp4[128], tmp5[128], *tmp6_ptr;
	char tmp_line[512];
	char *ionodes = NULL;
	uint16_t exit_status = 0, term_sig = 0;
	job_resources_t *job_resrcs = job_ptr->job_resrcs;
	char *out = NULL;
	time_t run_time;
	uint32_t min_nodes, max_nodes = 0;
	char *nodelist = "NodeList";
	bitstr_t *core_bitmap;
	char *host;
	int sock_inx, sock_reps, last;
	int abs_node_inx, rel_node_inx;
	int bit_inx, bit_reps;
	uint32_t *last_mem_alloc_ptr = NULL;
	uint32_t last_mem_alloc = NO_VAL;
	char *last_hosts;
	hostlist_t hl, hl_last;
	char select_buf[122];
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	if (cluster_flags & CLUSTER_FLAG_BG) {
		nodelist = "MidplaneList";
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_IONODES,
					    &ionodes);
	}

	/****** Line 1 ******/
	snprintf(tmp_line, sizeof(tmp_line), "JobId=%u ", job_ptr->job_id);
	out = xstrdup(tmp_line);
	if (job_ptr->array_job_id) {
		snprintf(tmp_line, sizeof(tmp_line), 
			 "ArrayJobId=%u ArrayTaskId=%u ",
			 job_ptr->array_job_id, job_ptr->array_task_id);
		xstrcat(out, tmp_line);
	}
	snprintf(tmp_line, sizeof(tmp_line), "Name=%s", job_ptr->name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 2 ******/
	user_name = uid_to_string((uid_t) job_ptr->user_id);
	group_name = gid_to_string((gid_t) job_ptr->group_id);
	snprintf(tmp_line, sizeof(tmp_line),
		 "UserId=%s(%u) GroupId=%s(%u)",
		 user_name, job_ptr->user_id, group_name, job_ptr->group_id);
	xfree(user_name);
	xfree(group_name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 3 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Priority=%u Account=%s QOS=%s",
		 job_ptr->priority, job_ptr->account, job_ptr->qos);
	xstrcat(out, tmp_line);
	if (slurm_get_track_wckey()) {
		snprintf(tmp_line, sizeof(tmp_line),
			 " WCKey=%s", job_ptr->wckey);
		xstrcat(out, tmp_line);
	}
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 4 ******/
	if (job_ptr->state_desc) {
		/* Replace white space with underscore for easier parsing */
		for (j=0; job_ptr->state_desc[j]; j++) {
			if (isspace((int)job_ptr->state_desc[j]))
				job_ptr->state_desc[j] = '_';
		}
		tmp6_ptr = job_ptr->state_desc;
	} else
		tmp6_ptr = job_reason_string(job_ptr->state_reason);
	snprintf(tmp_line, sizeof(tmp_line),
		 "JobState=%s Reason=%s Dependency=%s",
		 job_state_string(job_ptr->job_state), tmp6_ptr,
		 job_ptr->dependency);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 5 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Requeue=%u Restarts=%u BatchFlag=%u ",
		 job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag);
	xstrcat(out, tmp_line);
	if (WIFSIGNALED(job_ptr->exit_code))
		term_sig = WTERMSIG(job_ptr->exit_code);
	exit_status = WEXITSTATUS(job_ptr->exit_code);
	snprintf(tmp_line, sizeof(tmp_line),
		 "ExitCode=%u:%u", exit_status, term_sig);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 5a (optional) ******/
	if (!(job_ptr->show_flags & SHOW_DETAIL))
		goto line6;
	if (WIFSIGNALED(job_ptr->derived_ec))
		term_sig = WTERMSIG(job_ptr->derived_ec);
	else
		term_sig = 0;
	exit_status = WEXITSTATUS(job_ptr->derived_ec);
	snprintf(tmp_line, sizeof(tmp_line),
		 "DerivedExitCode=%u:%u", exit_status, term_sig);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 6 ******/
line6:
	snprintf(tmp_line, sizeof(tmp_line), "RunTime=");
	xstrcat(out, tmp_line);
	if (IS_JOB_PENDING(job_ptr))
		run_time = 0;
	else if (IS_JOB_SUSPENDED(job_ptr))
		run_time = job_ptr->pre_sus_time;
	else {
		time_t end_time;
		if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
			end_time = time(NULL);
		else
			end_time = job_ptr->end_time;
		if (job_ptr->suspend_time) {
			run_time = (time_t)
				(difftime(end_time, job_ptr->suspend_time)
				 + job_ptr->pre_sus_time);
		} else
			run_time = (time_t)
				difftime(end_time, job_ptr->start_time);
	}
	secs2time_str(run_time, tmp1, sizeof(tmp1));
	sprintf(tmp_line, "%s ", tmp1);
	xstrcat(out, tmp_line);

	snprintf(tmp_line, sizeof(tmp_line), "TimeLimit=");
	xstrcat(out, tmp_line);
	if (job_ptr->time_limit == NO_VAL)
		sprintf(tmp_line, "Partition_Limit");
	else {
		mins2time_str(job_ptr->time_limit, tmp_line,
			      sizeof(tmp_line));
	}
	xstrcat(out, tmp_line);
	snprintf(tmp_line, sizeof(tmp_line), " TimeMin=");
	xstrcat(out, tmp_line);
	if (job_ptr->time_min == 0)
		sprintf(tmp_line, "N/A");
	else {
		mins2time_str(job_ptr->time_min, tmp_line,
			      sizeof(tmp_line));
	}
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 7 ******/
	slurm_make_time_str((time_t *)&job_ptr->submit_time, time_str,
			    sizeof(time_str));
	snprintf(tmp_line, sizeof(tmp_line), "SubmitTime=%s ", time_str);
	xstrcat(out, tmp_line);

	slurm_make_time_str((time_t *)&job_ptr->eligible_time, time_str,
			    sizeof(time_str));
	snprintf(tmp_line, sizeof(tmp_line), "EligibleTime=%s", time_str);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 8 (optional) ******/
	if (job_ptr->resize_time) {
		slurm_make_time_str((time_t *)&job_ptr->resize_time, time_str,
				    sizeof(time_str));
		snprintf(tmp_line, sizeof(tmp_line), "ResizeTime=%s", time_str);
		xstrcat(out, tmp_line);
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
	}

	/****** Line 9 ******/
	slurm_make_time_str((time_t *)&job_ptr->start_time, time_str,
			    sizeof(time_str));
	snprintf(tmp_line, sizeof(tmp_line), "StartTime=%s ", time_str);
	xstrcat(out, tmp_line);

	snprintf(tmp_line, sizeof(tmp_line), "EndTime=");
	xstrcat(out, tmp_line);
	if ((job_ptr->time_limit == INFINITE) &&
	    (job_ptr->end_time > time(NULL)))
		sprintf(tmp_line, "Unknown");
	else {
		slurm_make_time_str ((time_t *)&job_ptr->end_time, time_str,
				     sizeof(time_str));
		sprintf(tmp_line, "%s", time_str);
	}
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 10 ******/
	if (job_ptr->preempt_time == 0)
		sprintf(tmp_line, "PreemptTime=None ");
	else {
		slurm_make_time_str((time_t *)&job_ptr->preempt_time,
				    time_str, sizeof(time_str));
		snprintf(tmp_line, sizeof(tmp_line), "PreemptTime=%s ",
			 time_str);
	}
	xstrcat(out, tmp_line);
	if (job_ptr->suspend_time) {
		slurm_make_time_str ((time_t *)&job_ptr->suspend_time,
				     time_str, sizeof(time_str));
	} else {
		strncpy(time_str, "None", sizeof(time_str));
	}
	snprintf(tmp_line, sizeof(tmp_line),
		 "SuspendTime=%s SecsPreSuspend=%ld",
		 time_str, (long int)job_ptr->pre_sus_time);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 11 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Partition=%s AllocNode:Sid=%s:%u",
		 job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 12 ******/
	snprintf(tmp_line, sizeof(tmp_line), "Req%s=%s Exc%s=%s",
		 nodelist, job_ptr->req_nodes, nodelist, job_ptr->exc_nodes);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 13 ******/
	xstrfmtcat(out, "%s=", nodelist);
	xstrcat(out, job_ptr->nodes);
	if (job_ptr->nodes && ionodes) {
		snprintf(tmp_line, sizeof(tmp_line), "[%s]", ionodes);
		xstrcat(out, tmp_line);
		xfree(ionodes);
	}
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 14 (optional) ******/
	if (job_ptr->batch_host) {
		snprintf(tmp_line, sizeof(tmp_line), "BatchHost=%s",
			 job_ptr->batch_host);
		xstrcat(out, tmp_line);
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
	}

	/****** Line 15 ******/
	if (cluster_flags & CLUSTER_FLAG_BG) {
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &min_nodes);
		if ((min_nodes == 0) || (min_nodes == NO_VAL)) {
			min_nodes = job_ptr->num_nodes;
			max_nodes = job_ptr->max_nodes;
		} else if (job_ptr->max_nodes)
			max_nodes = min_nodes;
	} else {
		min_nodes = job_ptr->num_nodes;
		max_nodes = job_ptr->max_nodes;
	}

	_sprint_range(tmp1, sizeof(tmp1), job_ptr->num_cpus, job_ptr->max_cpus);
	_sprint_range(tmp2, sizeof(tmp2), min_nodes, max_nodes);
	if (job_ptr->sockets_per_node == (uint16_t) NO_VAL)
		strcpy(tmp3, "*");
	else
		snprintf(tmp3, sizeof(tmp3), "%u", job_ptr->sockets_per_node);
	if (job_ptr->cores_per_socket == (uint16_t) NO_VAL)
		strcpy(tmp4, "*");
	else
		snprintf(tmp4, sizeof(tmp4), "%u", job_ptr->cores_per_socket);
	if (job_ptr->threads_per_core == (uint16_t) NO_VAL)
		strcpy(tmp5, "*");
	else
		snprintf(tmp5, sizeof(tmp5), "%u", job_ptr->threads_per_core);
	snprintf(tmp_line, sizeof(tmp_line),
		 "NumNodes=%s NumCPUs=%s CPUs/Task=%u ReqS:C:T=%s:%s:%s",
		 tmp2, tmp1, job_ptr->cpus_per_task, tmp3, tmp4, tmp5);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	if (!job_resrcs)
		goto line15;

	if (cluster_flags & CLUSTER_FLAG_BG) {
		if ((job_resrcs->cpu_array_cnt > 0) &&
		    (job_resrcs->cpu_array_value) &&
		    (job_resrcs->cpu_array_reps)) {
			int length = 0;
			xstrcat(out, "CPUs=");
			length += 10;
			for (i = 0; i < job_resrcs->cpu_array_cnt; i++) {
				if (length > 70) {
					/* skip to last CPU group entry */
					if (i < job_resrcs->cpu_array_cnt - 1) {
						continue;
					}
					/* add ellipsis before last entry */
					xstrcat(out, "...,");
					length += 4;
				}

				snprintf(tmp_line, sizeof(tmp_line), "%d",
					 job_resrcs->cpus[i]);
				xstrcat(out, tmp_line);
				length += strlen(tmp_line);
				if (job_resrcs->cpu_array_reps[i] > 1) {
					snprintf(tmp_line, sizeof(tmp_line),
						 "*%d",
						 job_resrcs->cpu_array_reps[i]);
					xstrcat(out, tmp_line);
					length += strlen(tmp_line);
				}
				if (i < job_resrcs->cpu_array_cnt - 1) {
					xstrcat(out, ",");
					length++;
				}
			}
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
		}
	} else {
		if (!job_resrcs->core_bitmap)
			goto line15;

		last  = bit_fls(job_resrcs->core_bitmap);
		if (last == -1)
			goto line15;

		hl = hostlist_create(job_ptr->nodes);
		if (!hl) {
			error("slurm_sprint_job_info: hostlist_create: %s",
			      job_ptr->nodes);
			return NULL;
		}
		hl_last = hostlist_create(NULL);
		if (!hl_last) {
			error("slurm_sprint_job_info: hostlist_create: NULL");
			hostlist_destroy(hl);
			return NULL;
		}

		bit_inx = 0;
		i = sock_inx = sock_reps = 0;
		abs_node_inx = job_ptr->node_inx[i];

		/* tmp1[] stores the current cpu(s) allocated */
		tmp2[0] = '\0';	/* stores last cpu(s) allocated */
		for (rel_node_inx=0; rel_node_inx < job_resrcs->nhosts;
		     rel_node_inx++) {

			if (sock_reps >=
			    job_resrcs->sock_core_rep_count[sock_inx]) {
				sock_inx++;
				sock_reps = 0;
			}
			sock_reps++;

			bit_reps = job_resrcs->sockets_per_node[sock_inx] *
				job_resrcs->cores_per_socket[sock_inx];

			core_bitmap = bit_alloc(bit_reps);
			for (j=0; j < bit_reps; j++) {
				if (bit_test(job_resrcs->core_bitmap, bit_inx))
					bit_set(core_bitmap, j);
				bit_inx++;
			}

			bit_fmt(tmp1, sizeof(tmp1), core_bitmap);
			FREE_NULL_BITMAP(core_bitmap);
			host = hostlist_shift(hl);
			/*
			 * If the allocation values for this host are not the
			 * same as the last host, print the report of the last
			 * group of hosts that had identical allocation values.
			 */
			if (strcmp(tmp1, tmp2) ||
			    (last_mem_alloc_ptr != job_resrcs->memory_allocated) ||
			    (job_resrcs->memory_allocated &&
			     (last_mem_alloc !=
			      job_resrcs->memory_allocated[rel_node_inx]))) {
				if (hostlist_count(hl_last)) {
					last_hosts = 
						hostlist_ranged_string_xmalloc(
						hl_last);
					snprintf(tmp_line, sizeof(tmp_line),
						 "  Nodes=%s CPU_IDs=%s Mem=%u",
						 last_hosts, tmp2,
						 last_mem_alloc_ptr ?
						 last_mem_alloc : 0);
					xfree(last_hosts);
					xstrcat(out, tmp_line);
					if (one_liner)
						xstrcat(out, " ");
					else
						xstrcat(out, "\n   ");

					hostlist_destroy(hl_last);
					hl_last = hostlist_create(NULL);
				}
				strcpy(tmp2, tmp1);
				last_mem_alloc_ptr = job_resrcs->memory_allocated;
				if (last_mem_alloc_ptr)
					last_mem_alloc = job_resrcs->
						memory_allocated[rel_node_inx];
				else
					last_mem_alloc = NO_VAL;
			}
			hostlist_push_host(hl_last, host);
			free(host);

			if (bit_inx > last)
				break;

			if (abs_node_inx > job_ptr->node_inx[i+1]) {
				i += 2;
				abs_node_inx = job_ptr->node_inx[i];
			} else {
				abs_node_inx++;
			}
		}

		if (hostlist_count(hl_last)) {
			last_hosts = hostlist_ranged_string_xmalloc(hl_last);
			snprintf(tmp_line, sizeof(tmp_line),
				 "  Nodes=%s CPU_IDs=%s Mem=%u",
				 last_hosts, tmp2,
				 last_mem_alloc_ptr ? last_mem_alloc : 0);
			xfree(last_hosts);
			xstrcat(out, tmp_line);
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
		}
		hostlist_destroy(hl);
		hostlist_destroy(hl_last);
	}
	/****** Line 15 ******/
line15:
	if (job_ptr->pn_min_memory & MEM_PER_CPU) {
		job_ptr->pn_min_memory &= (~MEM_PER_CPU);
		tmp6_ptr = "CPU";
	} else
		tmp6_ptr = "Node";

	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)job_ptr->pn_min_cpus,
				 tmp1, sizeof(tmp1), UNIT_NONE);
		snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%s",	tmp1);
	} else {
		snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%u",
			 job_ptr->pn_min_cpus);
	}

	xstrcat(out, tmp_line);
	convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1),
			 UNIT_MEGA);
	convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2),
			 UNIT_MEGA);
	snprintf(tmp_line, sizeof(tmp_line),
		 " MinMemory%s=%s MinTmpDiskNode=%s",
		 tmp6_ptr, tmp1, tmp2);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 16 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Features=%s Gres=%s Reservation=%s",
		 job_ptr->features, job_ptr->gres, job_ptr->resv_name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 17 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Shared=%s Contiguous=%d Licenses=%s Network=%s",
		 (job_ptr->shared == 0 ? "0" :
		  job_ptr->shared == 1 ? "1" : "OK"),
		 job_ptr->contiguous, job_ptr->licenses, job_ptr->network);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 18 ******/
	snprintf(tmp_line, sizeof(tmp_line), "Command=%s",
		 job_ptr->command);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 19 ******/
	snprintf(tmp_line, sizeof(tmp_line), "WorkDir=%s",
		 job_ptr->work_dir);
	xstrcat(out, tmp_line);

	if (cluster_flags & CLUSTER_FLAG_BG) {
		/****** Line 20 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_BG_ID);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			snprintf(tmp_line, sizeof(tmp_line),
				 "Block_ID=%s", select_buf);
			xstrcat(out, tmp_line);
		}

		/****** Line 21 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_MIXED_SHORT);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			xstrcat(out, select_buf);
		}

		if (cluster_flags & CLUSTER_FLAG_BGL) {
			/****** Line 22 (optional) ******/
			select_g_select_jobinfo_sprint(
				job_ptr->select_jobinfo,
				select_buf, sizeof(select_buf),
				SELECT_PRINT_BLRTS_IMAGE);
			if (select_buf[0] != '\0') {
				if (one_liner)
					xstrcat(out, " ");
				else
					xstrcat(out, "\n   ");
				snprintf(tmp_line, sizeof(tmp_line),
					 "BlrtsImage=%s", select_buf);
				xstrcat(out, tmp_line);
			}
		}
		/****** Line 23 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_LINUX_IMAGE);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			if (cluster_flags & CLUSTER_FLAG_BGL)
				snprintf(tmp_line, sizeof(tmp_line),
					 "LinuxImage=%s", select_buf);
			else
				snprintf(tmp_line, sizeof(tmp_line),
					 "CnloadImage=%s", select_buf);

			xstrcat(out, tmp_line);
		}
		/****** Line 24 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_MLOADER_IMAGE);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			snprintf(tmp_line, sizeof(tmp_line),
				 "MloaderImage=%s", select_buf);
			xstrcat(out, tmp_line);
		}
		/****** Line 25 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_RAMDISK_IMAGE);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			if (cluster_flags & CLUSTER_FLAG_BGL)
				snprintf(tmp_line, sizeof(tmp_line),
					 "RamDiskImage=%s", select_buf);
			else
				snprintf(tmp_line, sizeof(tmp_line),
					 "IoloadImage=%s", select_buf);
			xstrcat(out, tmp_line);
		}
	}

	/****** Line 26 (optional) ******/
	if (job_ptr->comment) {
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
		snprintf(tmp_line, sizeof(tmp_line), "Comment=%s ",
			 job_ptr->comment);
		xstrcat(out, tmp_line);
	}

	/****** Line 27 (optional) ******/
	if (job_ptr->batch_script) {
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
		xstrcat(out, "BatchScript=\n");
		xstrcat(out, job_ptr->batch_script);
	}

	/****** Line 28 (optional) ******/
	if (job_ptr->req_switch) {
		char time_buf[32];
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
		secs2time_str((time_t) job_ptr->wait4switch, time_buf,
			      sizeof(time_buf));
		snprintf(tmp_line, sizeof(tmp_line), "Switches=%u@%s\n",
			 job_ptr->req_switch, time_buf);
		xstrcat(out, tmp_line);
	}

	/****** Line 29 (optional) ******/
	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");

	return out;

}
Example #7
static slurmdb_job_rec_t *_slurmdb_create_job_rec(
	filetxt_job_rec_t *filetxt_job, slurmdb_job_cond_t *job_cond)
{
	slurmdb_job_rec_t *slurmdb_job = NULL;
	ListIterator itr = NULL;
	filetxt_step_rec_t *filetxt_step = NULL;

	if (!job_cond)
		goto no_cond;

	if (job_cond->state_list
	    && list_count(job_cond->state_list)) {
		char *object = NULL;
		itr = list_iterator_create(job_cond->state_list);
		while((object = list_next(itr))) {
			if (atoi(object) == filetxt_job->status) {
				list_iterator_destroy(itr);
				goto foundstate;
			}
		}
		list_iterator_destroy(itr);
		return NULL;	/* no match */
	}

foundstate:

no_cond:
	slurmdb_job = slurmdb_create_job_rec();
	slurmdb_job->associd = 0;
	slurmdb_job->account = xstrdup(filetxt_job->account);
	slurmdb_job->blockid = xstrdup(filetxt_job->header.blockid);
	slurmdb_job->cluster = NULL;
	slurmdb_job->elapsed = filetxt_job->elapsed;
	slurmdb_job->eligible = filetxt_job->header.job_submit;
	slurmdb_job->end = filetxt_job->header.timestamp;
	slurmdb_job->exitcode = filetxt_job->exitcode;
	slurmdb_job->gid = filetxt_job->header.gid;
	slurmdb_job->jobid = filetxt_job->header.jobnum;
	slurmdb_job->jobname = xstrdup(filetxt_job->jobname);
	slurmdb_job->partition = xstrdup(filetxt_job->header.partition);
	slurmdb_job->req_cpus = filetxt_job->ncpus;
	slurmdb_job->alloc_cpus = filetxt_job->ncpus;
	if (filetxt_job->nodes) {
		hostlist_t hl = hostlist_create(filetxt_job->nodes);
		slurmdb_job->alloc_nodes = hostlist_count(hl);
		hostlist_destroy(hl);
	}
	slurmdb_job->nodes = xstrdup(filetxt_job->nodes);
	slurmdb_job->priority = filetxt_job->priority;
	slurmdb_job->requid = filetxt_job->requid;
	memcpy(&slurmdb_job->stats, &filetxt_job->stats,
	       sizeof(slurmdb_stats_t));
	slurmdb_job->show_full = filetxt_job->show_full;
	slurmdb_job->start = filetxt_job->header.timestamp -
		slurmdb_job->elapsed;
	slurmdb_job->state = filetxt_job->status;

	slurmdb_job->steps = list_create(slurmdb_destroy_step_rec);
	if (filetxt_job->steps) {
		itr = list_iterator_create(filetxt_job->steps);
		while((filetxt_step = list_next(itr))) {
			slurmdb_step_rec_t *step =
				_slurmdb_create_step_rec(filetxt_step);
			if (step) {
				step->job_ptr = slurmdb_job;
				if (!slurmdb_job->first_step_ptr)
					slurmdb_job->first_step_ptr = step;
				list_append(slurmdb_job->steps, step);
			}
		}
		list_iterator_destroy(itr);
	}
	slurmdb_job->submit = filetxt_job->header.job_submit;

	slurmdb_job->sys_cpu_sec = filetxt_job->rusage.ru_stime.tv_sec;
	slurmdb_job->sys_cpu_usec = filetxt_job->rusage.ru_stime.tv_usec;
	slurmdb_job->tot_cpu_sec = filetxt_job->tot_cpu_sec;
	slurmdb_job->tot_cpu_usec = filetxt_job->tot_cpu_usec;
	slurmdb_job->track_steps = filetxt_job->track_steps;
	slurmdb_job->uid = filetxt_job->header.uid;
	slurmdb_job->user = NULL;
	slurmdb_job->user_cpu_sec = filetxt_job->rusage.ru_utime.tv_sec;
	slurmdb_job->user_cpu_usec = filetxt_job->rusage.ru_utime.tv_usec;

	return slurmdb_job;
}
Example #8
File: job_info.c  Project: fafik23/slurm
/*
 * slurm_sprint_job_info - output information about a specific Slurm
 *	job based upon the message loaded using slurm_load_jobs
 * IN job_ptr - an individual job information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *           NULL is returned on failure.
 */
extern char *
slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
{
	int i, j, k;
	char time_str[32], *group_name, *user_name;
	char *gres_last = "", tmp1[128], tmp2[128];
	char *tmp6_ptr;
	char tmp_line[1024 * 128];
	char tmp_path[MAXPATHLEN];
	char *ionodes = NULL;
	uint16_t exit_status = 0, term_sig = 0;
	job_resources_t *job_resrcs = job_ptr->job_resrcs;
	char *out = NULL;
	time_t run_time;
	uint32_t min_nodes, max_nodes = 0;
	char *nodelist = "NodeList";
	bitstr_t *cpu_bitmap;
	char *host;
	int sock_inx, sock_reps, last;
	int abs_node_inx, rel_node_inx;
	int64_t nice;
	int bit_inx, bit_reps;
	uint64_t *last_mem_alloc_ptr = NULL;
	uint64_t last_mem_alloc = NO_VAL64;
	char *last_hosts;
	hostlist_t hl, hl_last;
	char select_buf[122];
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	uint32_t threads;
	char *line_end = (one_liner) ? " " : "\n   ";

	if (cluster_flags & CLUSTER_FLAG_BG) {
		nodelist = "MidplaneList";
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_IONODES,
					    &ionodes);
	}

	/****** Line 1 ******/
	xstrfmtcat(out, "JobId=%u ", job_ptr->job_id);

	if (job_ptr->array_job_id) {
		if (job_ptr->array_task_str) {
			xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%s ",
				   job_ptr->array_job_id,
				   job_ptr->array_task_str);
		} else {
			xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%u ",
				   job_ptr->array_job_id,
				   job_ptr->array_task_id);
		}
	}
	xstrfmtcat(out, "JobName=%s", job_ptr->name);
	xstrcat(out, line_end);

	/****** Line 2 ******/
	user_name = uid_to_string((uid_t) job_ptr->user_id);
	group_name = gid_to_string((gid_t) job_ptr->group_id);
	xstrfmtcat(out, "UserId=%s(%u) GroupId=%s(%u) MCS_label=%s",
		   user_name, job_ptr->user_id, group_name, job_ptr->group_id,
		   (job_ptr->mcs_label==NULL) ? "N/A" : job_ptr->mcs_label);
	xfree(user_name);
	xfree(group_name);
	xstrcat(out, line_end);

	/****** Line 3 ******/
	nice = ((int64_t)job_ptr->nice) - NICE_OFFSET;
	xstrfmtcat(out, "Priority=%u Nice=%"PRIi64" Account=%s QOS=%s",
		   job_ptr->priority, nice, job_ptr->account, job_ptr->qos);
	if (slurm_get_track_wckey())
		xstrfmtcat(out, " WCKey=%s", job_ptr->wckey);
	xstrcat(out, line_end);

	/****** Line 4 ******/
	xstrfmtcat(out, "JobState=%s ", job_state_string(job_ptr->job_state));

	if (job_ptr->state_desc) {
		/* Replace white space with underscore for easier parsing */
		for (j=0; job_ptr->state_desc[j]; j++) {
			if (isspace((int)job_ptr->state_desc[j]))
				job_ptr->state_desc[j] = '_';
		}
		xstrfmtcat(out, "Reason=%s ", job_ptr->state_desc);
	} else
		xstrfmtcat(out, "Reason=%s ", job_reason_string(job_ptr->state_reason));

	xstrfmtcat(out, "Dependency=%s", job_ptr->dependency);
	xstrcat(out, line_end);

	/****** Line 5 ******/
	xstrfmtcat(out, "Requeue=%u Restarts=%u BatchFlag=%u Reboot=%u ",
		 job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag,
		 job_ptr->reboot);
	if (WIFSIGNALED(job_ptr->exit_code))
		term_sig = WTERMSIG(job_ptr->exit_code);
	exit_status = WEXITSTATUS(job_ptr->exit_code);
	xstrfmtcat(out, "ExitCode=%u:%u", exit_status, term_sig);
	xstrcat(out, line_end);

	/****** Line 5a (optional) ******/
	if (job_ptr->show_flags & SHOW_DETAIL) {
		if (WIFSIGNALED(job_ptr->derived_ec))
			term_sig = WTERMSIG(job_ptr->derived_ec);
		else
			term_sig = 0;
		exit_status = WEXITSTATUS(job_ptr->derived_ec);
		xstrfmtcat(out, "DerivedExitCode=%u:%u", exit_status, term_sig);
		xstrcat(out, line_end);
	}

	/****** Line 6 ******/
	if (IS_JOB_PENDING(job_ptr))
		run_time = 0;
	else if (IS_JOB_SUSPENDED(job_ptr))
		run_time = job_ptr->pre_sus_time;
	else {
		time_t end_time;
		if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
			end_time = time(NULL);
		else
			end_time = job_ptr->end_time;
		if (job_ptr->suspend_time) {
			run_time = (time_t)
				(difftime(end_time, job_ptr->suspend_time)
				 + job_ptr->pre_sus_time);
		} else
			run_time = (time_t)
				difftime(end_time, job_ptr->start_time);
	}
	secs2time_str(run_time, time_str, sizeof(time_str));
	xstrfmtcat(out, "RunTime=%s ", time_str);

	if (job_ptr->time_limit == NO_VAL)
		xstrcat(out, "TimeLimit=Partition_Limit ");
	else {
		mins2time_str(job_ptr->time_limit, time_str, sizeof(time_str));
		xstrfmtcat(out, "TimeLimit=%s ", time_str);
	}

	if (job_ptr->time_min == 0)
		xstrcat(out, "TimeMin=N/A");
	else {
		mins2time_str(job_ptr->time_min, time_str, sizeof(time_str));
		xstrfmtcat(out, "TimeMin=%s", time_str);
	}
	xstrcat(out, line_end);

	/****** Line 7 ******/
	slurm_make_time_str(&job_ptr->submit_time, time_str, sizeof(time_str));
	xstrfmtcat(out, "SubmitTime=%s ", time_str);

	slurm_make_time_str(&job_ptr->eligible_time, time_str, sizeof(time_str));
	xstrfmtcat(out, "EligibleTime=%s", time_str);

	xstrcat(out, line_end);

	/****** Line 8 (optional) ******/
	if (job_ptr->resize_time) {
		slurm_make_time_str(&job_ptr->resize_time, time_str, sizeof(time_str));
		xstrfmtcat(out, "ResizeTime=%s", time_str);
		xstrcat(out, line_end);
	}

	/****** Line 9 ******/
	slurm_make_time_str(&job_ptr->start_time, time_str, sizeof(time_str));
	xstrfmtcat(out, "StartTime=%s ", time_str);

	if ((job_ptr->time_limit == INFINITE) &&
	    (job_ptr->end_time > time(NULL)))
		xstrcat(out, "EndTime=Unknown ");
	else {
		slurm_make_time_str(&job_ptr->end_time, time_str, sizeof(time_str));
		xstrfmtcat(out, "EndTime=%s ", time_str);
	}

	if (job_ptr->deadline) {
		slurm_make_time_str(&job_ptr->deadline, time_str, sizeof(time_str));
		xstrfmtcat(out, "Deadline=%s", time_str);
	} else {
		xstrcat(out, "Deadline=N/A");
	}

	xstrcat(out, line_end);

	/****** Line 10 ******/
	if (job_ptr->preempt_time == 0)
		xstrcat(out, "PreemptTime=None ");
	else {
		slurm_make_time_str(&job_ptr->preempt_time, time_str, sizeof(time_str));
		xstrfmtcat(out, "PreemptTime=%s ", time_str);
	}

	if (job_ptr->suspend_time) {
		slurm_make_time_str(&job_ptr->suspend_time, time_str, sizeof(time_str));
		xstrfmtcat(out, "SuspendTime=%s ", time_str);
	} else
		xstrcat(out, "SuspendTime=None ");

	xstrfmtcat(out, "SecsPreSuspend=%ld", (long int)job_ptr->pre_sus_time);
	xstrcat(out, line_end);

	/****** Line 11 ******/
	xstrfmtcat(out, "Partition=%s AllocNode:Sid=%s:%u",
		   job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid);
	xstrcat(out, line_end);

	/****** Line 12 ******/
	xstrfmtcat(out, "Req%s=%s Exc%s=%s", nodelist, job_ptr->req_nodes,
		   nodelist, job_ptr->exc_nodes);
	xstrcat(out, line_end);

	/****** Line 13 ******/
	xstrfmtcat(out, "%s=%s", nodelist, job_ptr->nodes);
	if (job_ptr->nodes && ionodes) {
		xstrfmtcat(out, "[%s]", ionodes);
		xfree(ionodes);
	}
	if (job_ptr->sched_nodes)
		xstrfmtcat(out, " Sched%s=%s", nodelist, job_ptr->sched_nodes);

	xstrcat(out, line_end);

	/****** Line 14 (optional) ******/
	if (job_ptr->batch_host) {
		xstrfmtcat(out, "BatchHost=%s", job_ptr->batch_host);
		xstrcat(out, line_end);
	}

	/****** Line 14a (optional) ******/
	if (job_ptr->fed_siblings) {
		xstrfmtcat(out, "FedOrigin=%s FedSiblings=%s",
			   job_ptr->fed_origin_str, job_ptr->fed_siblings_str);
		xstrcat(out, line_end);
	}

	/****** Line 15 ******/
	if (cluster_flags & CLUSTER_FLAG_BG) {
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &min_nodes);
		if ((min_nodes == 0) || (min_nodes == NO_VAL)) {
			min_nodes = job_ptr->num_nodes;
			max_nodes = job_ptr->max_nodes;
		} else if (job_ptr->max_nodes)
			max_nodes = min_nodes;
	} else if (IS_JOB_PENDING(job_ptr)) {
		min_nodes = job_ptr->num_nodes;
		max_nodes = job_ptr->max_nodes;
		if (max_nodes && (max_nodes < min_nodes))
			min_nodes = max_nodes;
	} else {
		min_nodes = job_ptr->num_nodes;
		max_nodes = 0;
	}

	_sprint_range(tmp_line, sizeof(tmp_line), min_nodes, max_nodes);
	xstrfmtcat(out, "NumNodes=%s ", tmp_line);
	_sprint_range(tmp_line, sizeof(tmp_line), job_ptr->num_cpus, job_ptr->max_cpus);
	xstrfmtcat(out, "NumCPUs=%s ", tmp_line);

	xstrfmtcat(out, "NumTasks=%u ", job_ptr->num_tasks);
	xstrfmtcat(out, "CPUs/Task=%u ", job_ptr->cpus_per_task);

	if (job_ptr->boards_per_node == (uint16_t) NO_VAL)
		xstrcat(out, "ReqB:S:C:T=*:");
	else
		xstrfmtcat(out, "ReqB:S:C:T=%u:", job_ptr->boards_per_node);

	if (job_ptr->sockets_per_board == (uint16_t) NO_VAL)
		xstrcat(out, "*:");
	else
		xstrfmtcat(out, "%u:", job_ptr->sockets_per_board);

	if (job_ptr->cores_per_socket == (uint16_t) NO_VAL)
		xstrcat(out, "*:");
	else
		xstrfmtcat(out, "%u:", job_ptr->cores_per_socket);

	if (job_ptr->threads_per_core == (uint16_t) NO_VAL)
		xstrcat(out, "*");
	else
		xstrfmtcat(out, "%u", job_ptr->threads_per_core);

	xstrcat(out, line_end);

	/****** Line 16 ******/
	/* TRES should already have been converted from its simple form at this point */
	xstrfmtcat(out, "TRES=%s",
		   job_ptr->tres_alloc_str ? job_ptr->tres_alloc_str
					   : job_ptr->tres_req_str);
	xstrcat(out, line_end);

	/****** Line 17 ******/
	if (job_ptr->sockets_per_node == (uint16_t) NO_VAL)
		xstrcat(out, "Socks/Node=* ");
	else
		xstrfmtcat(out, "Socks/Node=%u ", job_ptr->sockets_per_node);

	if (job_ptr->ntasks_per_node == (uint16_t) NO_VAL)
		xstrcat(out, "NtasksPerN:B:S:C=*:");
	else
		xstrfmtcat(out, "NtasksPerN:B:S:C=%u:", job_ptr->ntasks_per_node);

	if (job_ptr->ntasks_per_board == (uint16_t) NO_VAL)
		xstrcat(out, "*:");
	else
		xstrfmtcat(out, "%u:", job_ptr->ntasks_per_board);

	if ((job_ptr->ntasks_per_socket == (uint16_t) NO_VAL) ||
	    (job_ptr->ntasks_per_socket == (uint16_t) INFINITE))
		xstrcat(out, "*:");
	else
		xstrfmtcat(out, "%u:", job_ptr->ntasks_per_socket);

	if ((job_ptr->ntasks_per_core == (uint16_t) NO_VAL) ||
	    (job_ptr->ntasks_per_core == (uint16_t) INFINITE))
		xstrcat(out, "* ");
	else
		xstrfmtcat(out, "%u ", job_ptr->ntasks_per_core);

	if (job_ptr->core_spec == (uint16_t) NO_VAL)
		xstrcat(out, "CoreSpec=*");
	else if (job_ptr->core_spec & CORE_SPEC_THREAD)
		xstrfmtcat(out, "ThreadSpec=%d",
			   (job_ptr->core_spec & (~CORE_SPEC_THREAD)));
	else
		xstrfmtcat(out, "CoreSpec=%u", job_ptr->core_spec);

	xstrcat(out, line_end);

	if (job_resrcs && cluster_flags & CLUSTER_FLAG_BG) {
		if ((job_resrcs->cpu_array_cnt > 0) &&
		    (job_resrcs->cpu_array_value) &&
		    (job_resrcs->cpu_array_reps)) {
			int length = 0;
			xstrcat(out, "CPUs=");
			for (i = 0; i < job_resrcs->cpu_array_cnt; i++) {
				/* only print 60 characters worth of this record */
				if (length > 60) {
					/* skip to last CPU group entry */
					if (i < job_resrcs->cpu_array_cnt - 1) {
						continue;
					}
					/* add ellipsis before last entry */
					xstrcat(out, "...,");
				}

				length += xstrfmtcat(out, "%d", job_resrcs->cpus[i]);
				if (job_resrcs->cpu_array_reps[i] > 1) {
					length += xstrfmtcat(out, "*%d",
							     job_resrcs->cpu_array_reps[i]);
				}
				if (i < job_resrcs->cpu_array_cnt - 1) {
					xstrcat(out, ",");
					length++;
				}
			}
			xstrcat(out, line_end);
		}
	} else if (job_resrcs && job_resrcs->core_bitmap &&
		   ((last = bit_fls(job_resrcs->core_bitmap)) != -1)) {
		hl = hostlist_create(job_resrcs->nodes);
		if (!hl) {
			error("slurm_sprint_job_info: hostlist_create: %s",
			      job_resrcs->nodes);
			return NULL;
		}
		hl_last = hostlist_create(NULL);
		if (!hl_last) {
			error("slurm_sprint_job_info: hostlist_create: NULL");
			hostlist_destroy(hl);
			return NULL;
		}

		bit_inx = 0;
		i = sock_inx = sock_reps = 0;
		abs_node_inx = job_ptr->node_inx[i];

		gres_last = "";
		/* tmp1[] stores the current cpu(s) allocated */
		tmp2[0] = '\0';	/* stores last cpu(s) allocated */
		for (rel_node_inx=0; rel_node_inx < job_resrcs->nhosts;
		     rel_node_inx++) {

			if (sock_reps >=
			    job_resrcs->sock_core_rep_count[sock_inx]) {
				sock_inx++;
				sock_reps = 0;
			}
			sock_reps++;

			bit_reps = job_resrcs->sockets_per_node[sock_inx] *
				   job_resrcs->cores_per_socket[sock_inx];
			host = hostlist_shift(hl);
			threads = _threads_per_core(host);
			cpu_bitmap = bit_alloc(bit_reps * threads);
			for (j = 0; j < bit_reps; j++) {
				if (bit_test(job_resrcs->core_bitmap, bit_inx)){
					for (k = 0; k < threads; k++)
						bit_set(cpu_bitmap,
							(j * threads) + k);
				}
				bit_inx++;
			}
			bit_fmt(tmp1, sizeof(tmp1), cpu_bitmap);
			FREE_NULL_BITMAP(cpu_bitmap);
			/*
			 * If the allocation values for this host are not the
			 * same as the last host, print the report of the last
			 * group of hosts that had identical allocation values.
			 */
			if (xstrcmp(tmp1, tmp2) ||
			    ((rel_node_inx < job_ptr->gres_detail_cnt) &&
			     xstrcmp(job_ptr->gres_detail_str[rel_node_inx],
				     gres_last)) ||
			    (last_mem_alloc_ptr != job_resrcs->memory_allocated) ||
			    (job_resrcs->memory_allocated &&
			     (last_mem_alloc !=
			      job_resrcs->memory_allocated[rel_node_inx]))) {
				if (hostlist_count(hl_last)) {
					last_hosts =
						hostlist_ranged_string_xmalloc(
						hl_last);
					xstrfmtcat(out,
						   "  Nodes=%s CPU_IDs=%s "
						   "Mem=%"PRIu64" GRES_IDX=%s",
						   last_hosts, tmp2,
						   last_mem_alloc_ptr ?
						   last_mem_alloc : 0,
						   gres_last);
					xfree(last_hosts);
					xstrcat(out, line_end);

					hostlist_destroy(hl_last);
					hl_last = hostlist_create(NULL);
				}
				strcpy(tmp2, tmp1);
				if (rel_node_inx < job_ptr->gres_detail_cnt) {
					gres_last = job_ptr->
						    gres_detail_str[rel_node_inx];
				} else {
					gres_last = "";
				}
				last_mem_alloc_ptr = job_resrcs->memory_allocated;
				if (last_mem_alloc_ptr)
					last_mem_alloc = job_resrcs->
						memory_allocated[rel_node_inx];
				else
					last_mem_alloc = NO_VAL64;
			}
			hostlist_push_host(hl_last, host);
			free(host);

			if (bit_inx > last)
				break;

			if (abs_node_inx > job_ptr->node_inx[i+1]) {
				i += 2;
				abs_node_inx = job_ptr->node_inx[i];
			} else {
				abs_node_inx++;
			}
		}

		if (hostlist_count(hl_last)) {
			last_hosts = hostlist_ranged_string_xmalloc(hl_last);
			xstrfmtcat(out, "  Nodes=%s CPU_IDs=%s Mem=%"PRIu64" GRES_IDX=%s",
				 last_hosts, tmp2,
				 last_mem_alloc_ptr ? last_mem_alloc : 0,
				 gres_last);
			xfree(last_hosts);
			xstrcat(out, line_end);
		}
		hostlist_destroy(hl);
		hostlist_destroy(hl_last);
	}
	/****** Line 18 ******/
	if (job_ptr->pn_min_memory & MEM_PER_CPU) {
		job_ptr->pn_min_memory &= (~MEM_PER_CPU);
		tmp6_ptr = "CPU";
	} else
		tmp6_ptr = "Node";

	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)job_ptr->pn_min_cpus, tmp1,
				 sizeof(tmp1), UNIT_NONE, NO_VAL,
				 CONVERT_NUM_UNIT_EXACT);
		xstrfmtcat(out, "MinCPUsNode=%s ", tmp1);
	} else {
		xstrfmtcat(out, "MinCPUsNode=%u ", job_ptr->pn_min_cpus);
	}

	convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1),
			 UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT);
	convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2),
			 UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT);
	xstrfmtcat(out, "MinMemory%s=%s MinTmpDiskNode=%s", tmp6_ptr, tmp1, tmp2);
	xstrcat(out, line_end);

	/****** Line ******/
	secs2time_str((time_t)job_ptr->delay_boot, tmp1, sizeof(tmp1));
	xstrfmtcat(out, "Features=%s DelayBoot=%s", job_ptr->features, tmp1);
	xstrcat(out, line_end);

	/****** Line ******/
	xstrfmtcat(out, "Gres=%s Reservation=%s",
		   job_ptr->gres, job_ptr->resv_name);
	xstrcat(out, line_end);

	/****** Line 20 ******/
	xstrfmtcat(out, "OverSubscribe=%s Contiguous=%d Licenses=%s Network=%s",
		   job_share_string(job_ptr->shared), job_ptr->contiguous,
		   job_ptr->licenses, job_ptr->network);
	xstrcat(out, line_end);

	/****** Line 21 ******/
	xstrfmtcat(out, "Command=%s", job_ptr->command);
	xstrcat(out, line_end);

	/****** Line 22 ******/
	xstrfmtcat(out, "WorkDir=%s", job_ptr->work_dir);

	if (cluster_flags & CLUSTER_FLAG_BG) {
		/****** Line 23 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_BG_ID);
		if (select_buf[0] != '\0') {
			xstrcat(out, line_end);
			xstrfmtcat(out, "Block_ID=%s", select_buf);
		}

		/****** Line 24 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_MIXED_SHORT);
		if (select_buf[0] != '\0') {
			xstrcat(out, line_end);
			xstrcat(out, select_buf);
		}

		/****** Line 26 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_LINUX_IMAGE);
		if (select_buf[0] != '\0') {
			xstrcat(out, line_end);
			xstrfmtcat(out, "CnloadImage=%s", select_buf);
		}
		/****** Line 27 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_MLOADER_IMAGE);
		if (select_buf[0] != '\0') {
			xstrcat(out, line_end);
			xstrfmtcat(out, "MloaderImage=%s", select_buf);
		}
		/****** Line 28 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_RAMDISK_IMAGE);
		if (select_buf[0] != '\0') {
			xstrcat(out, line_end);
			xstrfmtcat(out, "IoloadImage=%s", select_buf);
		}
	}

	/****** Line (optional) ******/
	if (job_ptr->admin_comment) {
		xstrcat(out, line_end);
		xstrfmtcat(out, "AdminComment=%s ", job_ptr->admin_comment);
	}

	/****** Line (optional) ******/
	if (job_ptr->comment) {
		xstrcat(out, line_end);
		xstrfmtcat(out, "Comment=%s ", job_ptr->comment);
	}

	/****** Line 30 (optional) ******/
	if (job_ptr->batch_flag) {
		xstrcat(out, line_end);
		slurm_get_job_stderr(tmp_path, sizeof(tmp_path), job_ptr);
		xstrfmtcat(out, "StdErr=%s", tmp_path);
	}

	/****** Line 31 (optional) ******/
	if (job_ptr->batch_flag) {
		xstrcat(out, line_end);
		slurm_get_job_stdin(tmp_path, sizeof(tmp_path), job_ptr);
		xstrfmtcat(out, "StdIn=%s", tmp_path);
	}

	/****** Line 32 (optional) ******/
	if (job_ptr->batch_flag) {
		xstrcat(out, line_end);
		slurm_get_job_stdout(tmp_path, sizeof(tmp_path), job_ptr);
		xstrfmtcat(out, "StdOut=%s", tmp_path);
	}

	/****** Line 33 (optional) ******/
	if (job_ptr->batch_script) {
		xstrcat(out, line_end);
		xstrcat(out, "BatchScript=\n");
		xstrcat(out, job_ptr->batch_script);
	}

	/****** Line 34 (optional) ******/
	if (job_ptr->req_switch) {
		char time_buf[32];
		xstrcat(out, line_end);
		secs2time_str((time_t) job_ptr->wait4switch, time_buf,
			      sizeof(time_buf));
		xstrfmtcat(out, "Switches=%u@%s\n", job_ptr->req_switch, time_buf);
	}

	/****** Line 35 (optional) ******/
	if (job_ptr->burst_buffer) {
		xstrcat(out, line_end);
		xstrfmtcat(out, "BurstBuffer=%s", job_ptr->burst_buffer);
	}

	/****** Line (optional) ******/
	if (job_ptr->burst_buffer_state) {
		xstrcat(out, line_end);
		xstrfmtcat(out, "BurstBufferState=%s",
			   job_ptr->burst_buffer_state);
	}

	/****** Line 36 (optional) ******/
	if (cpu_freq_debug(NULL, NULL, tmp1, sizeof(tmp1),
			   job_ptr->cpu_freq_gov, job_ptr->cpu_freq_min,
			   job_ptr->cpu_freq_max, NO_VAL) != 0) {
		xstrcat(out, line_end);
		xstrcat(out, tmp1);
	}

	/****** Line 37 ******/
	xstrcat(out, line_end);
	xstrfmtcat(out, "Power=%s", power_flags_str(job_ptr->power_flags));

	/****** Line 38 (optional) ******/
	if (job_ptr->bitflags) {
		xstrcat(out, line_end);
		if (job_ptr->bitflags & GRES_ENFORCE_BIND)
			xstrcat(out, "GresEnforceBind=Yes");
		if (job_ptr->bitflags & KILL_INV_DEP)
			xstrcat(out, "KillOnInvalidDependent=Yes");
		if (job_ptr->bitflags & NO_KILL_INV_DEP)
			xstrcat(out, "KillOnInvalidDependent=No");
		if (job_ptr->bitflags & SPREAD_JOB)
			xstrcat(out, "SpreadJob=Yes");
	}

	/****** END OF JOB RECORD ******/
	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");

	return out;
}
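
The loop above coalesces consecutive hosts whose CPU/memory/GRES values match and flushes one "Nodes=... CPU_IDs=..." report per group. A minimal sketch of that grouping pattern in isolation, using only the hostlist calls seen above; the function name and the val[] array are hypothetical, and <stdio.h> plus src/common/hostlist.h and src/common/xmalloc.h are assumed:

static void print_groups(char **hosts, int *val, int cnt)
{
	hostlist_t hl = hostlist_create(NULL);
	int i, last_val = 0;

	for (i = 0; i < cnt; i++) {
		/* value changed: flush the accumulated range */
		if (hostlist_count(hl) && (val[i] != last_val)) {
			char *range = hostlist_ranged_string_xmalloc(hl);
			printf("Nodes=%s Val=%d\n", range, last_val);
			xfree(range);
			hostlist_destroy(hl);
			hl = hostlist_create(NULL);
		}
		hostlist_push_host(hl, hosts[i]);
		last_val = val[i];
	}
	if (hostlist_count(hl)) {	/* final flush */
		char *range = hostlist_ranged_string_xmalloc(hl);
		printf("Nodes=%s Val=%d\n", range, last_val);
		xfree(range);
	}
	hostlist_destroy(hl);
}
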
Example #9
File: opts.c Project: cread/slurm
/*
 * parse_command_line, fill in params data structure with data
 */
extern void parse_command_line(int argc, char **argv)
{
	char *env_val = NULL;
	int opt_char;
	int option_index;
	hostlist_t host_list;
	bool long_form = false;
	bool opt_a_set = false, opt_p_set = false;
	bool env_a_set = false, env_p_set = false;
	static struct option long_options[] = {
		{"all",       no_argument,       0, 'a'},
		{"bg",        no_argument,       0, 'b'},
		{"dead",      no_argument,       0, 'd'},
		{"exact",     no_argument,       0, 'e'},
		{"federation",no_argument,       0, OPT_LONG_FEDR},
		{"help",      no_argument,       0, OPT_LONG_HELP},
		{"hide",      no_argument,       0, OPT_LONG_HIDE},
		{"iterate",   required_argument, 0, 'i'},
		{"local",     no_argument,       0, OPT_LONG_LOCAL},
		{"long",      no_argument,       0, 'l'},
		{"cluster",   required_argument, 0, 'M'},
		{"clusters",  required_argument, 0, 'M'},
		{"nodes",     required_argument, 0, 'n'},
		{"noconvert", no_argument,       0, OPT_LONG_NOCONVERT},
		{"noheader",  no_argument,       0, 'h'},
		{"Node",      no_argument,       0, 'N'},
		{"format",    required_argument, 0, 'o'},
		{"Format",    required_argument, 0, 'O'},
		{"partition", required_argument, 0, 'p'},
		{"responding",no_argument,       0, 'r'},
		{"list-reasons", no_argument,    0, 'R'},
		{"summarize", no_argument,       0, 's'},
		{"sort",      required_argument, 0, 'S'},
		{"states",    required_argument, 0, 't'},
		{"reservation",no_argument,      0, 'T'},
		{"usage",     no_argument,       0, OPT_LONG_USAGE},
		{"verbose",   no_argument,       0, 'v'},
		{"version",   no_argument,       0, 'V'},
		{NULL,        0,                 0, 0}
	};

	params.convert_flags = CONVERT_NUM_UNIT_EXACT;

	if (slurmctld_conf.fed_params &&
	    strstr(slurmctld_conf.fed_params, "fed_display"))
		params.federation_flag = true;

	if (getenv("SINFO_ALL")) {
		env_a_set = true;
		params.all_flag = true;
	}
	if (getenv("SINFO_FEDERATION"))
		params.federation_flag = true;
	if (getenv("SINFO_LOCAL"))
		params.local = true;
	if ( ( env_val = getenv("SINFO_PARTITION") ) ) {
		env_p_set = true;
		params.partition = xstrdup(env_val);
		params.part_list = _build_part_list(env_val);
		params.all_flag = true;
	}
	if (env_a_set && env_p_set) {
		error("Conflicting options, SINFO_ALL and SINFO_PARTITION, specified. "
		      "Please choose one or the other.");
		exit(1);
	}
	if ( ( env_val = getenv("SINFO_SORT") ) )
		params.sort = xstrdup(env_val);
	if ( ( env_val = getenv("SLURM_CLUSTERS") ) ) {
		if (!(params.clusters = slurmdb_get_info_cluster(env_val))) {
			print_db_notok(env_val, 1);
			exit(1);
		}
		working_cluster_rec = list_peek(params.clusters);
		params.local = true;
	}

	while ((opt_char = getopt_long(argc, argv,
				       "abdehi:lM:n:No:O:p:rRsS:t:TvV",
				       long_options, &option_index)) != -1) {
		switch (opt_char) {
		case (int)'?':
			fprintf(stderr,
				"Try \"sinfo --help\" for more information\n");
			exit(1);
			break;
		case (int)'a':
			opt_a_set = true;
			xfree(params.partition);
			FREE_NULL_LIST(params.part_list);
			params.all_flag = true;
			break;
		case (int)'b':
			params.cluster_flags = slurmdb_setup_cluster_flags();
			if (params.cluster_flags & CLUSTER_FLAG_BG)
				params.bg_flag = true;
			else {
				error("Must be on a BG system to use --bg "
				      "option, if using --cluster option "
				      "put the --bg option "
				      "after the --cluster option.");
				exit(1);
			}
			break;
		case (int)'d':
			params.dead_nodes = true;
			break;
		case (int)'e':
			params.exact_match = true;
			break;
		case (int)'h':
			params.no_header = true;
			break;
		case (int) 'i':
			params.iterate= atoi(optarg);
			if (params.iterate <= 0) {
				error ("Error: invalid entry for "
				       "--iterate=%s", optarg);
				exit(1);
			}
			break;
		case (int) 'l':
			params.long_output = true;
			break;
		case (int) 'M':
			FREE_NULL_LIST(params.clusters);
			if (!(params.clusters =
			      slurmdb_get_info_cluster(optarg))) {
				print_db_notok(optarg, 0);
				exit(1);
			}
			working_cluster_rec = list_peek(params.clusters);
			params.local = true;
			break;
		case OPT_LONG_NOCONVERT:
			params.convert_flags |= CONVERT_NUM_UNIT_NO;
			break;
		case (int) 'n':
			xfree(params.nodes);
			params.nodes = xstrdup(optarg);
			/*
			 * confirm valid nodelist entry
			 */
			host_list = hostlist_create(params.nodes);
			if (!host_list) {
				error("'%s' invalid entry for --nodes",
				      optarg);
				exit(1);
			}
			if (hostlist_count(host_list) == 1) {
				params.node_name_single = true;
				xfree(params.nodes);
				params.nodes =
				    hostlist_deranged_string_xmalloc(host_list);
			} else
				params.node_name_single = false;
			hostlist_destroy(host_list);
			break;
		case (int) 'N':
			params.node_flag = true;
			break;
		case (int) 'o':
			xfree(params.format);
			params.format = xstrdup(optarg);
			break;
		case (int) 'O':
			long_form = true;
			xfree(params.format);
			params.format = xstrdup(optarg);
			break;
		case (int) 'p':
			opt_p_set = true;
			xfree(params.partition);
			FREE_NULL_LIST(params.part_list);
			params.partition = xstrdup(optarg);
			params.part_list = _build_part_list(optarg);
			params.all_flag = true;
			break;
		case (int) 'r':
			params.responding_nodes = true;
			break;
		case (int) 'R':
			params.list_reasons = true;
			break;
		case (int) 's':
			params.summarize = true;
			break;
		case (int) 'S':
			xfree(params.sort);
			params.sort = xstrdup(optarg);
			break;
		case (int) 't':
			xfree(params.states);
			params.states = xstrdup(optarg);
			if (!(params.state_list = _build_state_list(optarg))) {
				error ("valid states: %s", _node_state_list ());
				exit (1);
			}
			break;
		case (int) 'T':
			params.reservation_flag = true;
			break;
		case (int) 'v':
			params.verbose++;
			break;
		case (int) 'V':
			print_slurm_version ();
			exit(0);
		case (int) OPT_LONG_FEDR:
			params.federation_flag = true;
			break;
		case (int) OPT_LONG_HELP:
			_help();
			exit(0);
		case (int) OPT_LONG_USAGE:
			_usage();
			exit(0);
		case OPT_LONG_HIDE:
			params.all_flag = false;
			break;
		case OPT_LONG_LOCAL:
			params.local = true;
			break;
		}
	}

	if (opt_a_set && opt_p_set) {
		error("Conflicting options, -a and -p, specified. "
		      "Please choose one or the other.");
		exit(1);
	}

	params.cluster_flags = slurmdb_setup_cluster_flags();

	if (params.federation_flag && !params.clusters && !params.local) {
		void *ptr = NULL;
		char *cluster_name = slurm_get_cluster_name();
		if (slurm_load_federation(&ptr) ||
		    !cluster_in_federation(ptr, cluster_name)) {
			/* Not in federation */
			params.local = true;
			slurm_destroy_federation_rec(ptr);
		} else {
			params.fed = (slurmdb_federation_rec_t *) ptr;
		}
		xfree(cluster_name);
	}

	if ( params.format == NULL ) {
		if ( params.summarize ) {
			params.part_field_flag = true;	/* compute size later */
			if (params.cluster_flags & CLUSTER_FLAG_BG)
				params.format = "%9P %.5a %.10l %.32F  %N";
			else
				params.format = "%9P %.5a %.10l %.16F  %N";
		} else if ( params.node_flag ) {
			params.node_field_flag = true;	/* compute size later */
			params.part_field_flag = true;	/* compute size later */
			params.format = params.long_output ?
			  "%N %.6D %.9P %.11T %.4c %.8z %.6m %.8d %.6w %.8f %20E" :
			  "%N %.6D %.9P %6t";

		} else if (params.list_reasons) {
			params.format = params.long_output ?
			  "%20E %12U %19H %6t %N" :
			  "%20E %9u %19H %N";

		} else if ((env_val = getenv ("SINFO_FORMAT"))) {
			params.format = xstrdup(env_val);


		} else if (params.fed) {
			params.part_field_flag = true;	/* compute size later */
			params.format = params.long_output ?
			  "%9P %8V %.5a %.10l %.10s %.4r %.8h %.10g %.6D %.11T %N" :
			  "%9P %8V %.5a %.10l %.6D %.6t %N";
		} else {
			params.part_field_flag = true;	/* compute size later */
			params.format = params.long_output ?
			  "%9P %.5a %.10l %.10s %.4r %.8h %.10g %.6D %.11T %N" :
			  "%9P %.5a %.10l %.6D %.6t %N";
		}
	}

	if (long_form)
		_parse_long_format(params.format);
	else
		_parse_format(params.format);

	if (params.list_reasons && (params.state_list == NULL)) {
		params.states = xstrdup("down,fail,drain,error");
		if (!(params.state_list = _build_state_list (params.states)))
			fatal ("Unable to build state list for -R!");
	}

	if (params.dead_nodes || params.nodes || params.partition ||
			params.responding_nodes ||params.state_list)
		params.filtering = true;

	if (params.verbose)
		_print_options();
}
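
The -n handler above shows a common idiom: hostlist_create() doubles as a syntax check for the nodelist, and a single-host list is collapsed to its deranged form. A hedged standalone sketch of that pattern (validate_nodelist is a hypothetical name; <stdbool.h> and the hostlist/xstring headers are assumed):

static char *validate_nodelist(const char *arg, bool *single)
{
	hostlist_t hl = hostlist_create(arg);
	char *result;

	if (!hl)
		return NULL;	/* malformed nodelist expression */
	*single = (hostlist_count(hl) == 1);
	result = *single ? hostlist_deranged_string_xmalloc(hl) :
			   xstrdup(arg);
	hostlist_destroy(hl);
	return result;		/* caller xfree()s */
}
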
Example #10
/* build maps for task layout on nodes */
static int _init_task_layout(slurm_step_layout_t *step_layout,
			     const char *arbitrary_nodes,
			     uint16_t *cpus_per_node, uint32_t *cpu_count_reps,
			     uint16_t cpus_per_task,
			     uint16_t task_dist, uint16_t plane_size)
{
	int cpu_cnt = 0, cpu_inx = 0, i;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

/* 	char *name = NULL; */
	uint16_t cpus[step_layout->node_cnt];

	if (step_layout->node_cnt == 0)
		return SLURM_ERROR;
	if (step_layout->tasks)	/* layout already completed */
		return SLURM_SUCCESS;

	if ((int)cpus_per_task < 1 || cpus_per_task == (uint16_t)NO_VAL)
		cpus_per_task = 1;

	step_layout->plane_size = plane_size;

	step_layout->tasks = xmalloc(sizeof(uint16_t)
				     * step_layout->node_cnt);
	step_layout->tids  = xmalloc(sizeof(uint32_t *)
				     * step_layout->node_cnt);
	if (!(cluster_flags & CLUSTER_FLAG_BG)) {
		hostlist_t hl = hostlist_create(step_layout->node_list);
		/* make sure the number of nodes we think we have
		 * is the correct number */
		i = hostlist_count(hl);
		if (step_layout->node_cnt > i)
			step_layout->node_cnt = i;
		hostlist_destroy(hl);
	}
	debug("laying out the %u tasks on %u hosts %s dist %u",
	      step_layout->task_cnt, step_layout->node_cnt,
	      step_layout->node_list, task_dist);
	if (step_layout->node_cnt < 1) {
		error("no hostlist given can't layout tasks");
		return SLURM_ERROR;
	}

	for (i=0; i<step_layout->node_cnt; i++) {
/* 		name = hostlist_shift(hl); */
/* 		if (!name) { */
/* 			error("hostlist incomplete for this job request"); */
/* 			hostlist_destroy(hl); */
/* 			return SLURM_ERROR; */
/* 		} */
/* 		debug2("host %d = %s", i, name); */
/* 		free(name); */
		cpus[i] = (cpus_per_node[cpu_inx] / cpus_per_task);
		if (cpus[i] == 0) {
			/* this can be a result of a heterogeneous allocation
			 * (e.g. 4 cpus on one node and 2 on the second with
			 *  cpus_per_task=3)  */
			cpus[i] = 1;
		}
		//info("got %d cpus", cpus[i]);
		if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) {
			/* move to next record */
			cpu_inx++;
			cpu_cnt = 0;
		}
	}

	if ((task_dist == SLURM_DIST_CYCLIC) ||
	    (task_dist == SLURM_DIST_CYCLIC_CYCLIC) ||
	    (task_dist == SLURM_DIST_CYCLIC_BLOCK))
		return _task_layout_cyclic(step_layout, cpus);
	else if ((task_dist == SLURM_DIST_ARBITRARY) &&
		 !(cluster_flags & CLUSTER_FLAG_FE))
		return _task_layout_hostfile(step_layout, arbitrary_nodes);
	else if (task_dist == SLURM_DIST_PLANE)
		return _task_layout_plane(step_layout, cpus);
	else
		return _task_layout_block(step_layout, cpus);
}
Example #11
/* run tasks on the specific set of hosts listed in the hostfile
 * XXX: Need to handle over-subscribe.
 */
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes)
{
	int i=0, j, taskid = 0, task_cnt=0;
	hostlist_iterator_t itr = NULL, itr_task = NULL;
	char *host = NULL;
	char *host_task = NULL;
	hostlist_t job_alloc_hosts = NULL;
	hostlist_t step_alloc_hosts = NULL;

	debug2("job list is %s", step_layout->node_list);
	job_alloc_hosts = hostlist_create(step_layout->node_list);
	itr = hostlist_iterator_create(job_alloc_hosts);
	if (!arbitrary_nodes) {
		error("no hostlist given for arbitrary dist");
		hostlist_iterator_destroy(itr);
		hostlist_destroy(job_alloc_hosts);
		return SLURM_ERROR;
	}

	debug2("list is %s", arbitrary_nodes);
	step_alloc_hosts = hostlist_create(arbitrary_nodes);
	if (hostlist_count(step_alloc_hosts) != step_layout->task_cnt) {
		error("Asked for %u tasks have %d in the nodelist.  "
		      "Check your nodelist, or set the -n option to be %d",
		      step_layout->task_cnt,
		      hostlist_count(step_alloc_hosts),
		      hostlist_count(step_alloc_hosts));
		/* avoid leaking the lists on this error path */
		hostlist_iterator_destroy(itr);
		hostlist_destroy(job_alloc_hosts);
		hostlist_destroy(step_alloc_hosts);
		return SLURM_ERROR;
	}
	itr_task = hostlist_iterator_create(step_alloc_hosts);
	while((host = hostlist_next(itr))) {
		step_layout->tasks[i] = 0;
		while((host_task = hostlist_next(itr_task))) {
			if (!strcmp(host, host_task)) {
				step_layout->tasks[i]++;
				task_cnt++;
			}
			free(host_task);
			if (task_cnt >= step_layout->task_cnt)
				break;
		}
		debug3("%s got %u tasks", host, step_layout->tasks[i]);
		if (step_layout->tasks[i] == 0)
			goto reset_hosts;
		step_layout->tids[i] = xmalloc(sizeof(uint32_t)
					       * step_layout->tasks[i]);
		taskid = 0;
		j = 0;
		hostlist_iterator_reset(itr_task);
		while((host_task = hostlist_next(itr_task))) {
			if (!strcmp(host, host_task)) {
				step_layout->tids[i][j] = taskid;
				j++;
			}
			taskid++;
			free(host_task);
			if (j >= step_layout->tasks[i])
				break;
		}
		i++;
	reset_hosts:
		hostlist_iterator_reset(itr_task);
		free(host);
		if (i > step_layout->task_cnt)
			break;
	}
	hostlist_iterator_destroy(itr);
	hostlist_iterator_destroy(itr_task);
	hostlist_destroy(job_alloc_hosts);
	hostlist_destroy(step_alloc_hosts);
	if (task_cnt != step_layout->task_cnt) {
		error("Asked for %u tasks but placed %d. Check your nodelist",
		      step_layout->task_cnt, task_cnt);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
Example #12
/* run tasks on the specific set of hosts listed in the hostfile
 * XXX: Need to handle over-subscribe.
 */
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes)
{
	int i=0, j, taskid = 0, task_cnt=0;
	hostlist_iterator_t itr = NULL, itr_task = NULL;
	char *host = NULL;

	hostlist_t job_alloc_hosts = NULL;
	hostlist_t step_alloc_hosts = NULL;

	int step_inx = 0, step_hosts_cnt = 0;
	struct node_record **step_hosts_ptrs = NULL;
	struct node_record *host_ptr = NULL;

	debug2("job list is %s", step_layout->node_list);
	if (!arbitrary_nodes) {
		error("no hostlist given for arbitrary dist");
		return SLURM_ERROR;
	}

	debug2("list is %s", arbitrary_nodes);
	step_alloc_hosts = hostlist_create(arbitrary_nodes);
	if (hostlist_count(step_alloc_hosts) != step_layout->task_cnt) {
		error("Asked for %u tasks have %d in the nodelist.  "
		      "Check your nodelist, or set the -n option to be %d",
		      step_layout->task_cnt,
		      hostlist_count(step_alloc_hosts),
		      hostlist_count(step_alloc_hosts));
		hostlist_destroy(step_alloc_hosts);
		return SLURM_ERROR;
	}

	job_alloc_hosts = hostlist_create(step_layout->node_list);
	itr             = hostlist_iterator_create(job_alloc_hosts);
	itr_task        = hostlist_iterator_create(step_alloc_hosts);

	/*
	 * Build array of pointers so that we can do pointer comparisons rather
	 * than strcmp's on nodes.
	 */
	step_hosts_cnt  = hostlist_count(step_alloc_hosts);
	step_hosts_ptrs = xmalloc(sizeof(struct node_record *) *
				  step_hosts_cnt);

	step_inx = 0;
	while((host = hostlist_next(itr_task))) {
		step_hosts_ptrs[step_inx++] = find_node_record_no_alias(host);
		free(host);
	}

	while((host = hostlist_next(itr))) {
		host_ptr = find_node_record(host);
		step_layout->tasks[i] = 0;

		for (step_inx = 0; step_inx < step_hosts_cnt; step_inx++) {
			if (host_ptr == step_hosts_ptrs[step_inx]) {
				step_layout->tasks[i]++;
				task_cnt++;
			}
			if (task_cnt >= step_layout->task_cnt)
				break;
		}
		debug3("%s got %u tasks", host, step_layout->tasks[i]);
		if (step_layout->tasks[i] == 0)
			goto reset_hosts;
		step_layout->tids[i] = xmalloc(sizeof(uint32_t)
					       * step_layout->tasks[i]);
		taskid = 0;
		j = 0;

		for (step_inx = 0; step_inx < step_hosts_cnt; step_inx++) {
			if (host_ptr == step_hosts_ptrs[step_inx]) {
				step_layout->tids[i][j] = taskid;
				j++;
			}
			taskid++;
			if (j >= step_layout->tasks[i])
				break;
		}
		i++;
	reset_hosts:
		free(host);
		if (i > step_layout->task_cnt)
			break;
	}
	hostlist_iterator_destroy(itr);
	hostlist_iterator_destroy(itr_task);
	hostlist_destroy(job_alloc_hosts);
	hostlist_destroy(step_alloc_hosts);
	xfree(step_hosts_ptrs);

	if (task_cnt != step_layout->task_cnt) {
		error("Asked for %u tasks but placed %d. Check your nodelist",
		      step_layout->task_cnt, task_cnt);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
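
This version replaces the per-pair strcmp() matching of Example #11 with one name lookup per step host followed by pointer equality. A minimal sketch of just the lookup-table step, assuming the slurmctld internals used above (find_node_record_no_alias()); note that, unlike the iterator in the original, hostlist_shift() consumes the list:

static struct node_record **build_host_ptrs(hostlist_t hosts, int cnt)
{
	struct node_record **ptrs = xmalloc(sizeof(*ptrs) * cnt);
	char *name;
	int i = 0;

	while ((name = hostlist_shift(hosts))) {
		ptrs[i++] = find_node_record_no_alias(name);
		free(name);	/* hostlist_shift() strings use free() */
	}
	return ptrs;		/* caller xfree()s the array */
}
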
Example #13
/* build maps for task layout on nodes */
static int _init_task_layout(slurm_step_layout_req_t *step_layout_req,
			     slurm_step_layout_t *step_layout,
			     const char *arbitrary_nodes)
{
	int cpu_cnt = 0, cpu_inx = 0, cpu_task_cnt = 0, cpu_task_inx = 0, i;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	uint16_t cpus[step_layout->node_cnt];
	uint16_t cpus_per_task[1];
	uint32_t cpus_task_reps[1];

	if (step_layout->node_cnt == 0)
		return SLURM_ERROR;
	if (step_layout->tasks)	/* layout already completed */
		return SLURM_SUCCESS;

	if (!step_layout_req->cpus_per_task) {
		cpus_per_task[0] = 1;
		cpus_task_reps[0] = step_layout_req->num_hosts;
		step_layout_req->cpus_per_task = cpus_per_task;
		step_layout_req->cpus_task_reps = cpus_task_reps;
	}

	if (((int)step_layout_req->cpus_per_task[0] < 1) ||
	    (step_layout_req->cpus_per_task[0] == NO_VAL16)) {
		step_layout_req->cpus_per_task[0] = 1;
		step_layout_req->cpus_task_reps[0] = step_layout_req->num_hosts;
	}

	step_layout->plane_size = step_layout_req->plane_size;

	step_layout->tasks = xmalloc(sizeof(uint16_t)
				     * step_layout->node_cnt);
	step_layout->tids  = xmalloc(sizeof(uint32_t *)
				     * step_layout->node_cnt);
	if (!(cluster_flags & CLUSTER_FLAG_BG)) {
		hostlist_t hl = hostlist_create(step_layout->node_list);
		/* make sure the number of nodes we think we have
		 * is the correct number */
		i = hostlist_count(hl);
		if (step_layout->node_cnt > i)
			step_layout->node_cnt = i;
		hostlist_destroy(hl);
	}
	debug("laying out the %u tasks on %u hosts %s dist %u",
	      step_layout->task_cnt, step_layout->node_cnt,
	      step_layout->node_list, step_layout->task_dist);
	if (step_layout->node_cnt < 1) {
		error("no hostlist given can't layout tasks");
		return SLURM_ERROR;
	}

	/* hostlist_t hl = hostlist_create(step_layout->node_list); */
	for (i=0; i<step_layout->node_cnt; i++) {
		/* char *name = hostlist_shift(hl); */
		/* if (!name) { */
		/* 	error("hostlist incomplete for this job request"); */
		/* 	hostlist_destroy(hl); */
		/* 	return SLURM_ERROR; */
		/* } */
		/* debug2("host %d = %s", i, name); */
		/* free(name); */
		cpus[i] = (step_layout_req->cpus_per_node[cpu_inx] /
			   step_layout_req->cpus_per_task[cpu_task_inx]);
		if (cpus[i] == 0) {
			/* this can be a result of a heterogeneous allocation
			 * (e.g. 4 cpus on one node and 2 on the second with
			 *  step_layout_req->cpus_per_task=3)  */
			cpus[i] = 1;
		}

		if (step_layout->plane_size &&
		    (step_layout->plane_size != NO_VAL16) &&
		    ((step_layout->task_dist & SLURM_DIST_STATE_BASE)
		     != SLURM_DIST_PLANE)) {
			/* plane_size when dist != plane is used to
			   convey ntasks_per_node. Adjust the number
			   of cpus to reflect that.
			*/
			uint16_t cpus_per_node =
				step_layout->plane_size *
				step_layout_req->cpus_per_task[cpu_task_inx];
			if (cpus[i] > cpus_per_node)
				cpus[i] = cpus_per_node;
		}

		/* info("got %d cpus", cpus[i]); */
		if ((++cpu_cnt) >=
		    step_layout_req->cpu_count_reps[cpu_inx]) {
			/* move to next record */
			cpu_inx++;
			cpu_cnt = 0;
		}

		if ((++cpu_task_cnt) >=
		    step_layout_req->cpus_task_reps[cpu_task_inx]) {
			/* move to next record */
			cpu_task_inx++;
			cpu_task_cnt = 0;
		}
	}

	if ((step_layout->task_dist & SLURM_DIST_NODEMASK)
	    == SLURM_DIST_NODECYCLIC)
		return _task_layout_cyclic(step_layout, cpus);
	else if (((step_layout->task_dist & SLURM_DIST_STATE_BASE)
		  == SLURM_DIST_ARBITRARY)
		 && !(cluster_flags & CLUSTER_FLAG_FE))
		return _task_layout_hostfile(step_layout, arbitrary_nodes);
	else if ((step_layout->task_dist & SLURM_DIST_STATE_BASE)
		 == SLURM_DIST_PLANE)
		return _task_layout_plane(step_layout, cpus);
	else
		return _task_layout_block(step_layout, cpus);
}
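
The dispatch at the end selects block, cyclic, plane, or hostfile layout. As a toy illustration only (the real _task_layout_cyclic() also weights placement by the cpus[] array), cyclic distribution deals task ids round-robin across nodes; <stdint.h> types assumed:

static void layout_cyclic_toy(uint16_t *tasks, uint32_t task_cnt,
			      uint32_t node_cnt)
{
	uint32_t t;

	/* tasks[] is assumed zeroed by the caller */
	for (t = 0; t < task_cnt; t++)
		tasks[t % node_cnt]++;	/* task t lands on node t % N */
}
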
Example #14
File: srun_job.c Project: VURM/slurm
/*
 * Create an srun job structure for a step w/out an allocation response msg.
 * (i.e. inside an allocation)
 */
srun_job_t *
job_step_create_allocation(resource_allocation_response_msg_t *resp)
{
	uint32_t job_id = resp->job_id;
	srun_job_t *job = NULL;
	allocation_info_t *ai = xmalloc(sizeof(*ai));
	hostlist_t hl = NULL;
	char *buf = NULL;
	int count = 0;
	uint32_t alloc_count = 0;

	ai->jobid          = job_id;
	ai->stepid         = NO_VAL;
	ai->nodelist = opt.alloc_nodelist;
	hl = hostlist_create(ai->nodelist);
	hostlist_uniq(hl);
	alloc_count = hostlist_count(hl);
	ai->nnodes = alloc_count;
	hostlist_destroy(hl);

	if (opt.exc_nodes) {
		hostlist_t exc_hl = hostlist_create(opt.exc_nodes);
		hostlist_t inc_hl = NULL;
		char *node_name = NULL;

		hl = hostlist_create(ai->nodelist);
		if(opt.nodelist) {
			inc_hl = hostlist_create(opt.nodelist);
		}
		hostlist_uniq(hl);
		//info("using %s or %s", opt.nodelist, ai->nodelist);
		while ((node_name = hostlist_shift(exc_hl))) {
			int inx = hostlist_find(hl, node_name);
			if (inx >= 0) {
				debug("excluding node %s", node_name);
				hostlist_delete_nth(hl, inx);
				ai->nnodes--;	/* decrement node count */
			}
			if(inc_hl) {
				inx = hostlist_find(inc_hl, node_name);
				if (inx >= 0) {
					error("Requested node %s is also "
					      "in the excluded list.",
					      node_name);
					error("Job not submitted.");
					hostlist_destroy(exc_hl);
					hostlist_destroy(inc_hl);
					goto error;
				}
			}
			free(node_name);
		}
		hostlist_destroy(exc_hl);

		/* we need to set this here so if there are more nodes
		 * available than we requested we can set it
		 * straight. If there is no exclude list then we set
		 * the vars then.
		 */
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation
			 */
			if(opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if(!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;

		count = hostlist_count(hl);
		if(!count) {
			error("Hostlist is now nothing!  Can't run job.");
			hostlist_destroy(hl);
			goto error;
		}
		if(inc_hl) {
			count = hostlist_count(inc_hl);
			if(count < ai->nnodes) {
				/* add more nodes to get correct number for
				   allocation */
				hostlist_t tmp_hl = hostlist_copy(hl);
				int i=0;
				int diff = ai->nnodes - count;
				buf = hostlist_ranged_string_xmalloc(inc_hl);
				hostlist_delete(tmp_hl, buf);
				xfree(buf);
				while ((node_name = hostlist_shift(tmp_hl)) &&
				       (i < diff)) {
					hostlist_push(inc_hl, node_name);
					i++;
				}
				hostlist_destroy(tmp_hl);
			}
			buf = hostlist_ranged_string_xmalloc(inc_hl);
			hostlist_destroy(inc_hl);
			xfree(opt.nodelist);
			opt.nodelist = buf;
		} else {
			if (count > ai->nnodes) {
				/* remove more nodes than needed for
				   allocation; hostlist indices are
				   0-based, so delete from the tail */
				int i=0;
				for (i=count-1; i>=ai->nnodes; i--)
					hostlist_delete_nth(hl, i);
			}
			xfree(opt.nodelist);
			opt.nodelist = hostlist_ranged_string_xmalloc(hl);
		}

		hostlist_destroy(hl);
	} else {
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation
			 */
			if(opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if(!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;
		/* Don't reset the ai->nodelist because that is the
		 * nodelist we want to say the allocation is under
		 * opt.nodelist is what is used for the allocation.
		 */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
	}

	/* get the correct number of hosts to run tasks on */
	if (opt.nodelist) {
		hl = hostlist_create(opt.nodelist);
		if (opt.distribution != SLURM_DIST_ARBITRARY)
			hostlist_uniq(hl);
		if (!hostlist_count(hl)) {
			error("Hostlist is now nothing!  Can not run job.");
			hostlist_destroy(hl);
			goto error;
		}

		buf = hostlist_ranged_string_xmalloc(hl);
		count = hostlist_count(hl);
		hostlist_destroy(hl);
		/* Don't reset the ai->nodelist because that is the
		 * nodelist we want to say the allocation is under
		 * opt.nodelist is what is used for the allocation.
		 */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
		xfree(opt.nodelist);
		opt.nodelist = buf;
	}

	if (opt.distribution == SLURM_DIST_ARBITRARY) {
		if (count != opt.ntasks) {
			error("You asked for %d tasks but specified %d nodes",
			      opt.ntasks, count);
			goto error;
		}
	}

	if (ai->nnodes == 0) {
		error("No nodes in allocation, can't run job");
		goto error;
	}

	ai->num_cpu_groups = resp->num_cpu_groups;
	ai->cpus_per_node  = resp->cpus_per_node;
	ai->cpu_count_reps = resp->cpu_count_reps;

/* 	info("looking for %d nodes out of %s with a must list of %s", */
/* 	     ai->nnodes, ai->nodelist, opt.nodelist); */
	/*
	 * Create job
	 */
	job = _job_create_structure(ai);
error:
	xfree(ai);
	return (job);

}
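
The exclude handling above shifts each name off exc_hl and deletes it from the working hostlist when present. The same pattern as a hedged standalone helper (exclude_hosts is a hypothetical name):

static int exclude_hosts(hostlist_t hl, const char *exc_nodes)
{
	hostlist_t exc_hl = hostlist_create(exc_nodes);
	char *name;
	int removed = 0;

	while ((name = hostlist_shift(exc_hl))) {
		int inx = hostlist_find(hl, name);
		if (inx >= 0) {
			hostlist_delete_nth(hl, inx);
			removed++;
		}
		free(name);
	}
	hostlist_destroy(exc_hl);
	return removed;		/* how many nodes were dropped */
}
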
Example #15
File: forward.c Project: cread/slurm
void *_fwd_tree_thread(void *arg)
{
	fwd_tree_t *fwd_tree = (fwd_tree_t *)arg;
	List ret_list = NULL;
	char *name = NULL;
	char *buf = NULL;
	slurm_msg_t send_msg;

	slurm_msg_t_init(&send_msg);
	send_msg.msg_type = fwd_tree->orig_msg->msg_type;
	send_msg.data = fwd_tree->orig_msg->data;
	send_msg.protocol_version = fwd_tree->orig_msg->protocol_version;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(fwd_tree->tree_hl))) {
		if (slurm_conf_get_addr(name, &send_msg.address)
		    == SLURM_ERROR) {
			error("fwd_tree_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(fwd_tree->tree_mutex);
			mark_as_failed_forward(&fwd_tree->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
			slurm_cond_signal(fwd_tree->notify);
			slurm_mutex_unlock(fwd_tree->tree_mutex);
			free(name);

			continue;
		}

		send_msg.forward.timeout = fwd_tree->timeout;
		if ((send_msg.forward.cnt = hostlist_count(fwd_tree->tree_hl))){
			buf = hostlist_ranged_string_xmalloc(
					fwd_tree->tree_hl);
			send_msg.forward.nodelist = buf;
		} else
			send_msg.forward.nodelist = NULL;

		if (send_msg.forward.nodelist && send_msg.forward.nodelist[0]) {
			debug3("Tree sending to %s along with %s",
			       name, send_msg.forward.nodelist);
		} else
			debug3("Tree sending to %s", name);

		ret_list = slurm_send_addr_recv_msgs(&send_msg, name,
						     fwd_tree->timeout);

		xfree(send_msg.forward.nodelist);

		if (ret_list) {
			int ret_cnt = list_count(ret_list);
			/* This is most common if a slurmd is running
			   an older version of Slurm than the
			   originator of the message.
			*/
			if ((ret_cnt <= send_msg.forward.cnt) &&
			    (errno != SLURM_COMMUNICATIONS_CONNECTION_ERROR)) {
				error("fwd_tree_thread: %s failed to forward "
				      "the message, expecting %d ret got only "
				      "%d",
				      name, send_msg.forward.cnt + 1, ret_cnt);
				if (ret_cnt > 1) { /* not likely */
					ret_data_info_t *ret_data_info = NULL;
					ListIterator itr =
						list_iterator_create(ret_list);
					while ((ret_data_info =
						list_next(itr))) {
						if (xstrcmp(ret_data_info->
							    node_name, name))
							hostlist_delete_host(
								fwd_tree->
								tree_hl,
								ret_data_info->
								node_name);
					}
					list_iterator_destroy(itr);
				}
			}

			slurm_mutex_lock(fwd_tree->tree_mutex);
			list_transfer(fwd_tree->ret_list, ret_list);
			slurm_cond_signal(fwd_tree->notify);
			slurm_mutex_unlock(fwd_tree->tree_mutex);
			FREE_NULL_LIST(ret_list);
			/* try next node */
			if (ret_cnt <= send_msg.forward.cnt) {
				free(name);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_start_msg_tree_internal(
					fwd_tree->tree_hl, NULL,
					fwd_tree,
					hostlist_count(fwd_tree->tree_hl));
				continue;
			}
		} else {
			/* This should never happen (when this was
			 * written slurm_send_addr_recv_msgs always
			 * returned a list). */
			error("fwd_tree_thread: no return list given from "
			      "slurm_send_addr_recv_msgs spawned for %s",
			      name);
			slurm_mutex_lock(fwd_tree->tree_mutex);
			mark_as_failed_forward(
				&fwd_tree->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			slurm_cond_signal(fwd_tree->notify);
			slurm_mutex_unlock(fwd_tree->tree_mutex);
			free(name);

			continue;
		}

		free(name);

		/* check for error and try again */
		if (errno == SLURM_COMMUNICATIONS_CONNECTION_ERROR)
 			continue;

		break;
	}

	_destroy_tree_fwd(fwd_tree);

	return NULL;
}
Example #16
/*
 * Based on ideas provided by Hongjia Cao <*****@*****.**> in PMI2 plugin
 */
int pmixp_coll_init(pmixp_coll_t *coll, const pmix_proc_t *procs,
		    size_t nprocs, pmixp_coll_type_t type)
{
	hostlist_t hl;
	int max_depth, width, depth, i;
	char *p;

#ifndef NDEBUG
	coll->magic = PMIXP_COLL_STATE_MAGIC;
#endif
	coll->type = type;
	coll->state = PMIXP_COLL_SYNC;
	coll->pset.procs = xmalloc(sizeof(*procs) * nprocs);
	coll->pset.nprocs = nprocs;
	memcpy(coll->pset.procs, procs, sizeof(*procs) * nprocs);

	if (SLURM_SUCCESS != _hostset_from_ranges(procs, nprocs, &hl)) {
		/* TODO: provide ranges output routine */
		PMIXP_ERROR("Bad ranges information");
		goto err_exit;
	}
#ifdef PMIXP_COLL_DEBUG
	/* if we debug collectives - store a copy of a full
	 * hostlist to resolve participant id to the hostname */
	coll->peers_hl = hostlist_copy(hl);
#endif

	width = slurm_get_tree_width();
	coll->peers_cnt = hostlist_count(hl);
	coll->my_peerid = hostlist_find(hl, pmixp_info_hostname());
	reverse_tree_info(coll->my_peerid, coll->peers_cnt, width,
			  &coll->prnt_peerid, &coll->chldrn_cnt, &depth,
			  &max_depth);

	/* We are interested in the number of direct children */
	coll->seq = 0;
	coll->contrib_children = 0;
	coll->contrib_local = false;
	coll->chldrn_ids = xmalloc(sizeof(int) * width);
	coll->contrib_chld = xmalloc(sizeof(int) * width);
	coll->chldrn_cnt = reverse_tree_direct_children(coll->my_peerid,
							coll->peers_cnt,
							width, depth,
							coll->chldrn_ids);
	if (coll->prnt_peerid == -1) {
		/* if we are the root of the tree:
		 * - we don't have a parent;
		 * - we have a large list of all children (and we don't
		 *   want ourselves in there)
		 */
		coll->prnt_host = NULL;
		coll->all_chldrn_hl = hostlist_copy(hl);
		hostlist_delete_host(coll->all_chldrn_hl,
				     pmixp_info_hostname());
		coll->chldrn_str =
			hostlist_ranged_string_xmalloc(coll->all_chldrn_hl);
	} else {
		/* for all other nodes in the tree we need to know:
		 * - the nodename of our parent;
		 * - we don't need the all-children list or hl anymore
		 */

		/*
		 * setup parent id's
		 */
		p = hostlist_nth(hl, coll->prnt_peerid);
		coll->prnt_host = xstrdup(p);
		free(p);
		/* reset prnt_peerid to the global peer */
		coll->prnt_peerid = pmixp_info_job_hostid(coll->prnt_host);

		/*
		 * setup root id's
		 * (we need this for the SLURM API communication case)
		 */
		p = hostlist_nth(hl, 0);
		coll->root_host = xstrdup(p);
		free(p);
		/* reset root_peerid to the global peer */
		coll->root_peerid = pmixp_info_job_hostid(coll->root_host);

		/* use empty hostlist here */
		coll->all_chldrn_hl = hostlist_create("");
		coll->chldrn_str = NULL;
	}

	/* fix up children peer ids to the global ones */
	for(i=0; i<coll->chldrn_cnt; i++){
		p = hostlist_nth(hl, coll->chldrn_ids[i]);
		coll->chldrn_ids[i] = pmixp_info_job_hostid(p);
		free(p);
	}
	hostlist_destroy(hl);

	/* Collective state */
	coll->ufwd_buf = pmixp_server_buf_new();
	coll->dfwd_buf = pmixp_server_buf_new();
	_reset_coll_ufwd(coll);
	_reset_coll_dfwd(coll);
	coll->cbdata = NULL;
	coll->cbfunc = NULL;

	/* init fine grained lock */
	slurm_mutex_init(&coll->lock);

	return SLURM_SUCCESS;
err_exit:
	return SLURM_ERROR;
}
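
Peer ids in the collective correspond to positions in the hostlist, so resolving an id to a hostname is hostlist_nth() plus the free()/xstrdup() dance used above. A minimal sketch (peer_host is a hypothetical name):

static char *peer_host(hostlist_t hl, int peerid)
{
	char *p = hostlist_nth(hl, peerid);	/* malloc'd, free() it */
	char *name = xstrdup(p);		/* xfree()d by the caller */

	free(p);
	return name;
}
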
Example #17
File: forward.c Project: cread/slurm
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	forward_struct_t *fwd_struct = fwd_msg->fwd_struct;
	Buf buffer = init_buf(BUF_SIZE);	/* probably enough for header */
	List ret_list = NULL;
	int fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				continue;
			}
			goto cleanup;
		}
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(
				&fwd_struct->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
#if 0
		info("sending %d forwards (%s) to %s",
		     fwd_msg->header.forward.cnt,
		     fwd_msg->header.forward.nodelist, name);
#endif
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		if (remaining_buf(buffer) < fwd_struct->buf_len) {
			int new_size = buffer->processed + fwd_struct->buf_len;
			new_size += 1024; /* padded for paranoia */
			xrealloc_nz(buffer->head, new_size);
			buffer->size = new_size;
		}
		if (fwd_struct->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_struct->buf, fwd_struct->buf_len);
			buffer->processed += fwd_struct->buf_len;
		}

		/*
		 * forward message
		 */
		if (slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) {
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				close(fd);
				fd = -1;
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* These messages don't have a return message, but if
		 * we got here things worked out so make note of the
		 * list of nodes as success.
		 */
		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_struct->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_struct->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		if (fwd_msg->header.forward.cnt > 0) {
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			if (!fwd_msg->header.forward.tree_width)
				fwd_msg->header.forward.tree_width =
					slurm_get_tree_width();
			steps = (fwd_msg->header.forward.cnt+1) /
					fwd_msg->header.forward.tree_width;
			fwd_msg->timeout = (message_timeout*steps);
			/* info("got %d * %d = %d", message_timeout, */
			/*      steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout*steps);
			/* info("now  + %d*%d = %d", start_timeout, */
			/*      steps, fwd_msg->timeout); */
		}

		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
		/*      fwd_msg->header.forward.cnt, list_count(ret_list)); */

		if (!ret_list || (fwd_msg->header.forward.cnt != 0
				  && list_count(ret_list) <= 1)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			FREE_NULL_LIST(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				close(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt+1)
			  != list_count(ret_list)) {
			/* this should never be called since the above
			   should catch the failed forwards and pipe
			   them back down, but this is here so we
			   never have to worry about a locked
			   mutex */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr
				= hostlist_iterator_create(hl);
			error("We shouldn't be here.  We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt+1),
			      list_count(ret_list));
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!xstrcmp(tmp,
						   ret_data_info->node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_struct->ret_list,
						tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(
					&fwd_struct->ret_list,
					name,
					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		break;
	}
	slurm_mutex_lock(&fwd_struct->forward_mutex);
	if (ret_list) {
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_struct->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		FREE_NULL_LIST(ret_list);
	}
	free(name);
cleanup:
	if ((fd >= 0) && close(fd) < 0)
		error ("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	slurm_cond_signal(&fwd_struct->notify);
	slurm_mutex_unlock(&fwd_struct->forward_mutex);
	xfree(fwd_msg);

	return (NULL);
}
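
Both forwarding loops above share the same fan-out bookkeeping: the shifted head of the hostlist is the next hop, and whatever remains becomes that hop's forward.nodelist. A hedged sketch of just that step (next_hop is a hypothetical name):

static char *next_hop(hostlist_t hl, char **fwd_nodelist)
{
	char *head = hostlist_shift(hl);	/* NULL when hl is empty */

	if (head && hostlist_count(hl))
		*fwd_nodelist = hostlist_ranged_string_xmalloc(hl);
	else
		*fwd_nodelist = NULL;
	return head;	/* free() head, xfree() *fwd_nodelist */
}
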
Example #18
static int
eliminate_nodes (char **hosts)
{
  hostlist_t hl = NULL;
  hostlist_t hlnew = NULL;
  hostlist_iterator_t hitr = NULL;
  ipmidetect_t id = NULL;
  char *host = NULL;
  char hostbuf[HOSTLIST_BUFLEN + 1];
  int rv = -1;

  assert (hosts);
  assert (*hosts);

  if (!(id = ipmidetect_handle_create ()))
    {
      fprintf (stderr,
               "ipmidetect_handle_create\n");
      goto cleanup;
    }

  if (ipmidetect_load_data (id,
                            NULL,
                            0,
                            0) < 0)
    {
      if (ipmidetect_errnum (id) == IPMIDETECT_ERR_CONNECT
          || ipmidetect_errnum (id) == IPMIDETECT_ERR_CONNECT_TIMEOUT)
        fprintf (stderr,
                 "Error connecting to ipmidetect daemon\n");
      else
        fprintf (stderr,
                 "ipmidetect_load_data: %s\n", ipmidetect_errormsg (id));
      goto cleanup;
    }

  if (!(hl = hostlist_create (*hosts)))
    {
      fprintf (stderr,
               "hostlist_create: %s\n",
               strerror (errno));
      goto cleanup;
    }

  if (!(hlnew = hostlist_create (*hosts)))
    {
      fprintf (stderr,
               "hostlist_create: %s\n",
               strerror (errno));
      goto cleanup;
    }

  if (!(hitr = hostlist_iterator_create (hl)))
    {
      fprintf (stderr,
               "hostlist_iterator_create: %s\n",
               strerror (errno));
      goto cleanup;
    }

  while ((host = hostlist_next (hitr)))
    {
      int ret;

      if ((ret = ipmidetect_is_node_detected (id, host)) < 0)
        {
          if (ipmidetect_errnum (id) == IPMIDETECT_ERR_NOTFOUND)
            fprintf (stderr,
                     "Node '%s' unrecognized by ipmidetect\n", host);
          else
            fprintf (stderr,
                     "ipmidetect_is_node_detected: %s\n", ipmidetect_errormsg (id));
          goto cleanup;
        }

      if (!ret)
        hostlist_delete (hlnew, host);

      free (host);
    }
  host = NULL;

  if (!hostlist_count (hlnew))
    {
      rv = 0;
      goto cleanup;
    }
 
  memset (hostbuf, '\0', HOSTLIST_BUFLEN + 1);
 
  if (hostlist_ranged_string (hlnew, HOSTLIST_BUFLEN, hostbuf) < 0)
    {
      fprintf (stderr,
               "hostlist_ranged_string: truncation\n");
      goto cleanup;
    }

  free (*hosts);
  if (!(*hosts = strdup (hostbuf)))
    {
      fprintf (stderr, "strdup: %s\n", strerror (errno));
      goto cleanup;
    }

  rv = hostlist_count (hlnew);
 cleanup:
  if (id)
    ipmidetect_handle_destroy (id);
  if (hitr)
    hostlist_iterator_destroy (hitr);
  if (hl)
    hostlist_destroy (hl);
  if (hlnew)
    hostlist_destroy (hlnew);
  free (host);
  return (rv);
}
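
A hypothetical caller, showing the in/out contract of eliminate_nodes(): *hosts is replaced with the subset of detected nodes, and the return value is their count (0 if none, -1 on error). Fragment only; <stdio.h>, <stdlib.h>, and <string.h> are assumed:

  char *hosts = strdup ("node[0-7]");
  int n = eliminate_nodes (&hosts);

  if (n < 0)
    fprintf (stderr, "eliminate_nodes failed\n");
  else if (n == 0)
    fprintf (stderr, "no nodes in node[0-7] were detected\n");
  else
    printf ("%d node(s) detected: %s\n", n, hosts);
  free (hosts);
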
Example #19
File: allocate.c Project: VURM/slurm
/*
 * Read a SLURM hostfile specified by "filename".  "filename" must contain
 * a list of SLURM NodeNames, one per line.  Reads up to "n" number of hostnames
 * from the file. Returns a string representing a hostlist ranged string of
 * the contents of the file.  This is a helper function, it does not
 * contact any SLURM daemons.
 *
 * Returns a string representing the hostlist.  Returns NULL if there are fewer
 * than "n" hostnames in the file, or if an error occurs.  If "n" ==
 * NO_VAL then the entire file is read in
 *
 * Returned string must be freed with free().
 */
char *slurm_read_hostfile(char *filename, int n)
{
	FILE *fp = NULL;
	char in_line[BUFFER_SIZE];	/* input line */
	int i, j;
	int line_size;
	int line_num = 0;
	hostlist_t hostlist = NULL;
	char *nodelist = NULL;

	if (filename == NULL || strlen(filename) == 0)
		return NULL;

	if ((fp = fopen(filename, "r")) == NULL) {
		error("slurm_allocate_resources error opening file %s, %m",
		      filename);
		return NULL;
	}

	hostlist = hostlist_create(NULL);
	if (hostlist == NULL) {
		fclose(fp);
		return NULL;
	}

	while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {
		line_num++;
		line_size = strlen(in_line);
		if (line_size == (BUFFER_SIZE - 1)) {
			error ("Line %d, of hostfile %s too long",
			       line_num, filename);
			fclose (fp);
			hostlist_destroy(hostlist);
			return NULL;
		}

		for (i = 0; i < line_size; i++) {
			if (in_line[i] == '\n') {
				in_line[i] = '\0';
				break;
			}
			if (in_line[i] == '\0')
				break;
			if (in_line[i] != '#')
				continue;
			if ((i > 0) && (in_line[i - 1] == '\\')) {
				for (j = i; j < line_size; j++) {
					in_line[j - 1] = in_line[j];
				}
				line_size--;
				continue;
			}
			in_line[i] = '\0';
			break;
		}

		hostlist_push(hostlist, in_line);
		if (n != (int)NO_VAL && hostlist_count(hostlist) == n)
			break;
	}
	fclose(fp);

	if (hostlist_count(hostlist) <= 0) {
		error("Hostlist is empty!");
		goto cleanup_hostfile;
	}
	if (hostlist_count(hostlist) < n) {
		error("Too few NodeNames in SLURM Hostfile");
		goto cleanup_hostfile;
	}

	nodelist = (char *)malloc(0xffff);
	if (!nodelist) {
		error("Nodelist xmalloc failed");
		goto cleanup_hostfile;
	}

	if (hostlist_ranged_string(hostlist, 0xffff, nodelist) == -1) {
		error("Hostlist is too long for the allocate RPC!");
		free(nodelist);
		nodelist = NULL;
		goto cleanup_hostfile;
	}

	debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist);

cleanup_hostfile:
	hostlist_destroy(hostlist);

	return nodelist;
}
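
A hypothetical caller, per the contract documented above: the returned string comes from malloc(), so it is released with free(), not xfree(). Fragment only; <stdio.h> and <stdlib.h> are assumed:

	char *nodelist = slurm_read_hostfile(getenv("SLURM_HOSTFILE"),
					     NO_VAL);
	if (nodelist) {
		printf("allocating on %s\n", nodelist);
		free(nodelist);
	} else {
		fprintf(stderr, "could not build nodelist from hostfile\n");
	}
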
Example #20
File: gres_mps.c Project: SchedMD/slurm
/*
 * Convert all MPS records to new entries in a list where each File is a
 * unique device (i.e. convert a record with "File=nvidia[0-3]" into 4 separate
 * records). Similar to _build_gpu_list(), but we copy more fields, divide the
 * "Count" across all MPS records and remove from the original list.
 */
static List _build_mps_list(List gres_list)
{
	ListIterator itr;
	gres_slurmd_conf_t *gres_record, *mps_record;
	List mps_list;
	hostlist_t hl;
	char *f_name;
	uint64_t count_per_file;
	int mps_no_file_recs = 0, mps_file_recs = 0;

	if (gres_list == NULL)
		return NULL;

	mps_list = list_create(_delete_gres_list);
	itr = list_iterator_create(gres_list);
	while ((gres_record = list_next(itr))) {
		if (xstrcmp(gres_record->name, "mps"))
			continue;
		if (!gres_record->file) {
			if (mps_no_file_recs)
				fatal("gres/mps: bad configuration, multiple configurations without \"File\"");
			if (mps_file_recs)
				fatal("gres/mps: multiple configurations with and without \"File\"");
			mps_no_file_recs++;
			mps_record = xmalloc(sizeof(gres_slurmd_conf_t));
			mps_record->config_flags = gres_record->config_flags;
			if (gres_record->type_name)
				mps_record->config_flags |= GRES_CONF_HAS_TYPE;
			mps_record->count = gres_record->count;
			mps_record->cpu_cnt = gres_record->cpu_cnt;
			mps_record->cpus = xstrdup(gres_record->cpus);
			if (gres_record->cpus_bitmap) {
				mps_record->cpus_bitmap =
					bit_copy(gres_record->cpus_bitmap);
			}
			mps_record->name = xstrdup(gres_record->name);
			mps_record->plugin_id = gres_record->plugin_id;
			mps_record->type_name = xstrdup(gres_record->type_name);
			list_append(mps_list, mps_record);
		} else {
			mps_file_recs++;
			if (mps_no_file_recs)
				fatal("gres/mps: multiple configurations with and without \"File\"");
			hl = hostlist_create(gres_record->file);
			count_per_file = gres_record->count/hostlist_count(hl);
			while ((f_name = hostlist_shift(hl))) {
				mps_record =
					xmalloc(sizeof(gres_slurmd_conf_t));
				mps_record->config_flags =
					gres_record->config_flags;
				if (gres_record->type_name) {
					mps_record->config_flags |=
						GRES_CONF_HAS_TYPE;
				}
				mps_record->count = count_per_file;
				mps_record->cpu_cnt = gres_record->cpu_cnt;
				mps_record->cpus = xstrdup(gres_record->cpus);
				if (gres_record->cpus_bitmap) {
					mps_record->cpus_bitmap =
					     bit_copy(gres_record->cpus_bitmap);
				}
				mps_record->file = xstrdup(f_name);
				mps_record->name = xstrdup(gres_record->name);
				mps_record->plugin_id = gres_record->plugin_id;
				mps_record->type_name =
					xstrdup(gres_record->type_name);
				list_append(mps_list, mps_record);
				free(f_name);
			}
			hostlist_destroy(hl);
		}
		(void) list_delete_item(itr);
	}
	list_iterator_destroy(itr);

	return mps_list;
}
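
The File expansion above divides Count evenly across the expanded device names, e.g. Count=400 over File=nvidia[0-3] yields four records of 100 each. A minimal fragment of just that split (<inttypes.h> and <stdio.h> assumed):

	hostlist_t hl = hostlist_create("nvidia[0-3]");
	uint64_t count_per_file = 400 / hostlist_count(hl);	/* 100 */
	char *f_name;

	while ((f_name = hostlist_shift(hl))) {
		printf("File=%s Count=%"PRIu64"\n", f_name, count_per_file);
		free(f_name);
	}
	hostlist_destroy(hl);
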
Example #21
/*
 * _set_collectors calls the split_hostlist API on the all-nodes hostlist
 * to set the node to be used as a collector for unsolicited node aggregation.
 *
 * If this node is a forwarding node (first node in any hostlist),
 * then its collector and backup are the ControlMachine and its backup.
 *
 * Otherwise, we find the hostlist containing this node.
 * The forwarding node in that hostlist becomes a collector, and the next
 * node which is not this node becomes the backup.
 * That list is split; we iterate through it searching for a list in
 * which this node is a forwarding node. If found, we set the collector and
 * backup, else this process is repeated.
 */
static void _set_collectors(char *this_node_name)
{
	slurm_ctl_conf_t *conf;
	hostlist_t  nodes;
	hostlist_t* hll = NULL;
	char *parent = NULL, *backup = NULL;
	char addrbuf[32];
	int i, j, f;
	int hl_count = 0;
	uint16_t parent_port;
	uint16_t backup_port;
	bool found = false;
	bool ctldparent = true;

#ifdef HAVE_FRONT_END
	return; /* on a FrontEnd system this would never be useful. */
#endif

	if (!run_in_daemon("slurmd"))
		return; /* Only compute nodes have collectors */

	/* Set the initial iteration, collector is controller,
	 * full list is split */
	xassert(this_node_name);

	conf = slurm_conf_lock();
	nodes = _get_all_nodes();
	parent = strdup(conf->control_addr);
	if (conf->backup_addr) {
		backup = strdup(conf->backup_addr);
	}
	parent_port = conf->slurmctld_port;
	backup_port = parent_port;
	slurm_conf_unlock();
	while (!found) {
		if ( route_g_split_hostlist(nodes, &hll, &hl_count) ) {
			error("unable to split forward hostlist");
			goto clean; /* collector addrs remains null */
		}
		/* Find which hostlist contains this node */
		for (i=0; i < hl_count; i++) {
			f = hostlist_find(hll[i], this_node_name);
			if (f != -1)
				break;
		}
		if (i == hl_count) {
			fatal("ROUTE -- %s not found in node_record_table",
			      this_node_name);
		}
		if (f == 0) {
			/* we are a forwarded to node,
			 * so our parent is parent */
			if (hostlist_count(hll[i]) > 1)
				this_is_collector = true;
			xfree(msg_collect_node);
			msg_collect_node = xmalloc(sizeof(slurm_addr_t));
			if (ctldparent)
				slurm_set_addr(msg_collect_node, parent_port,
					       parent);
			else {
				slurm_conf_get_addr(parent, msg_collect_node);
				msg_collect_node->sin_port = htons(parent_port);
			}
			if (debug_flags & DEBUG_FLAG_ROUTE) {
				slurm_print_slurm_addr(msg_collect_node,
						       addrbuf, 32);
				info("ROUTE -- message collector address is %s",
				     addrbuf);
			}
			xfree(msg_collect_backup);
			if (backup) {
				msg_collect_backup =
					xmalloc(sizeof(slurm_addr_t));
				if (ctldparent) {
					slurm_set_addr(msg_collect_backup,
						       backup_port, backup);
				} else {
					slurm_conf_get_addr(backup,
							    msg_collect_backup);
					msg_collect_backup->sin_port =
						htons(backup_port);
				}
				if (debug_flags & DEBUG_FLAG_ROUTE) {
					slurm_print_slurm_addr(
						msg_collect_backup,
						addrbuf, 32);
					info("ROUTE -- message collector backup"
					     " address is %s", addrbuf);
				}
			} else {
				if (debug_flags & DEBUG_FLAG_ROUTE) {
					info("ROUTE -- no message collector "
					     "backup");
				}

			}
			found = true;
			goto clean;
		}

		/* We are not a forwarding node, the first node in this list
		 * will split the forward_list.
		 * We also know that the forwarding node is not a controller.
		 *
		 * clean up parent context */
		ctldparent = false;
		hostlist_destroy(nodes);
		if (parent)
			free(parent);
		if (backup)
			free(backup);
		nodes = hostlist_copy(hll[i]);
		for (j=0; j < hl_count; j++) {
			hostlist_destroy(hll[j]);
		}
		xfree(hll);

		/* set our parent, backup, and continue search */
		parent = hostlist_shift(nodes);
		backup = hostlist_nth(nodes, 0);
		if (strcmp(backup, this_node_name) == 0) {
			free(backup);
			backup = NULL;
			if (hostlist_count(nodes) > 1)
				backup = hostlist_nth(nodes, 1);
		}
		parent_port =  slurm_conf_get_port(parent);
		if (backup) {
			backup_port = slurm_conf_get_port(backup);
		} else
			backup_port = 0;

	}
clean:
	if (debug_flags & DEBUG_FLAG_ROUTE) {
		if (this_is_collector)
			info("ROUTE -- %s is a collector node", this_node_name);
		else
			info("ROUTE -- %s is a leaf node", this_node_name);
	}
	hostlist_destroy(nodes);
	if (parent)
		free(parent);
	if (backup)
		free(backup);
	for (i=0; i < hl_count; i++) {
		hostlist_destroy(hll[i]);
	}
	xfree(hll);
}
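
The search above is driven by a handful of hostlist calls: shift the head off the list for the parent, peek at the next entry for the backup, and locate the current node with hostlist_find(). A minimal sketch of that pattern follows; the header name is an assumption, and note that strings returned by hostlist_shift()/hostlist_nth() are released with free(), not xfree().

#include <stdio.h>
#include <stdlib.h>
#include "hostlist.h"	/* assumed header for the LLNL hostlist API */

int main(void)
{
	hostlist_t hl = hostlist_create("tux[0-3]");
	char *parent = hostlist_shift(hl);	/* "tux0" becomes the parent */
	char *backup = hostlist_nth(hl, 0);	/* "tux1" becomes the backup */
	int pos = hostlist_find(hl, "tux2");	/* index of tux2 in the rest */

	printf("parent=%s backup=%s tux2@%d remaining=%d\n",
	       parent, backup, pos, hostlist_count(hl));

	free(parent);	/* hostlist hands out malloc'd strings */
	free(backup);
	hostlist_destroy(hl);
	return 0;
}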
Example #22
File: print.c Project: npe9/slurm
void print_fields(type_t type, void *object)
{
	if (!object) {
		fatal ("Job or step record is NULL");
		return;
	}

	slurmdb_job_rec_t *job = (slurmdb_job_rec_t *)object;
	slurmdb_step_rec_t *step = (slurmdb_step_rec_t *)object;
	jobcomp_job_rec_t *job_comp = (jobcomp_job_rec_t *)object;
	print_field_t *field = NULL;
	int curr_inx = 1;
	struct passwd *pw = NULL;
	struct group *gr = NULL;
	char outbuf[FORMAT_STRING_SIZE];
	bool got_stats = false;
	int cpu_tres_rec_count = 0;
	int step_cpu_tres_rec_count = 0;

	switch(type) {
	case JOB:
		step = NULL;
		if (!job->track_steps)
			step = (slurmdb_step_rec_t *)job->first_step_ptr;
		/* Set this to avoid printing info for things that don't
		 * mean anything, like an allocation that never ran
		 * anything. */
		if (!step)
			job->track_steps = 1;
		else
			step_cpu_tres_rec_count =
				slurmdb_find_tres_count_in_string(
					step->tres_alloc_str, TRES_CPU);

		if (job->stats.cpu_min != NO_VAL)
			got_stats = true;

		job_comp = NULL;

		cpu_tres_rec_count = slurmdb_find_tres_count_in_string(
			job->tres_alloc_str, TRES_CPU);
		break;
	case JOBSTEP:
		job = step->job_ptr;

		if (step->stats.cpu_min != NO_VAL)
			got_stats = true;

		if (!(step_cpu_tres_rec_count =
		      slurmdb_find_tres_count_in_string(
			      step->tres_alloc_str, TRES_CPU)))
			step_cpu_tres_rec_count =
				slurmdb_find_tres_count_in_string(
					job->tres_alloc_str, TRES_CPU);

		job_comp = NULL;
		break;
	case JOBCOMP:
		job = NULL;
		step = NULL;
		break;
	default:
		break;
	}

	list_iterator_reset(print_fields_itr);
	while((field = list_next(print_fields_itr))) {
		char *tmp_char = NULL, id[FORMAT_STRING_SIZE];
		int tmp_int = NO_VAL, tmp_int2 = NO_VAL;
		double tmp_dub = (double)NO_VAL;
		uint32_t tmp_uint32 = (uint32_t)NO_VAL;
		uint64_t tmp_uint64 = (uint64_t)NO_VAL;

		memset(&outbuf, 0, sizeof(outbuf));
		switch(field->type) {
		case PRINT_ALLOC_CPUS:
			switch(type) {
			case JOB:
				tmp_int = cpu_tres_rec_count;

				/* we want to use the step info; fall through */
				if (!step)
					break;
			case JOBSTEP:
				tmp_int = step_cpu_tres_rec_count;
				break;
			case JOBCOMP:
			default:
				tmp_int = job_comp->proc_cnt;
				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_ALLOC_GRES:
			switch(type) {
			case JOB:
				tmp_char = job->alloc_gres;
				break;
			case JOBSTEP:
				tmp_char = step->job_ptr->alloc_gres;
				break;
			case JOBCOMP:
			default:
				tmp_char = NULL;
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_ACCOUNT:
			switch(type) {
			case JOB:
				tmp_char = job->account;
				break;
			case JOBSTEP:
				tmp_char = step->job_ptr->account;
				break;
			case JOBCOMP:
			default:
				tmp_char = "n/a";
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_ACT_CPUFREQ:
			if (got_stats) {
				switch (type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub =
							step->stats.act_cpufreq;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.act_cpufreq;
					break;
				default:
					break;
				}
			}
			if (!fuzzy_equal(tmp_dub, NO_VAL))
				_local_convert_num_unit2((double)tmp_dub,
							 outbuf, sizeof(outbuf),
							 UNIT_KILO, 1000,
							 false);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_ASSOCID:
			switch(type) {
			case JOB:
				tmp_int = job->associd;
				break;
			case JOBSTEP:
				tmp_int = step->job_ptr->associd;
				break;
			case JOBCOMP:
			default:
				tmp_int = NO_VAL;
				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_AVECPU:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = job->stats.cpu_ave;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.cpu_ave;
					break;
				case JOBCOMP:
				default:
					break;
				}
			}

			if (!fuzzy_equal(tmp_dub, NO_VAL))
				tmp_char = _elapsed_time((long)tmp_dub, 0);

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_AVEDISKREAD:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = job->
							stats.disk_read_ave;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.disk_read_ave;
					break;
				case JOBCOMP:
				default:
					break;
				}
			}
			_print_small_double(outbuf, sizeof(outbuf),
					    tmp_dub, UNIT_MEGA);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_AVEDISKWRITE:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = job->
							stats.disk_write_ave;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.disk_write_ave;
					break;
				case JOBCOMP:
				default:
					break;
				}
			}
			_print_small_double(outbuf, sizeof(outbuf),
					    tmp_dub, UNIT_MEGA);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_AVEPAGES:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = job->stats.pages_ave;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.pages_ave;
					break;
				case JOBCOMP:
				default:
					break;
				}
			}
			if (!fuzzy_equal(tmp_dub, NO_VAL))
				_local_convert_num_unit((double)tmp_dub, outbuf,
							sizeof(outbuf),
							UNIT_KILO);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_AVERSS:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = job->stats.rss_ave;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.rss_ave;
					break;
				case JOBCOMP:
				default:
					break;
				}
			}
			if (!fuzzy_equal(tmp_dub, NO_VAL))
				_local_convert_num_unit((double)tmp_dub, outbuf,
							sizeof(outbuf),
							UNIT_KILO);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_AVEVSIZE:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = job->stats.vsize_ave;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.vsize_ave;
					break;
				case JOBCOMP:
				default:
					break;
				}
			}

			if (!fuzzy_equal(tmp_dub, NO_VAL))
				_local_convert_num_unit((double)tmp_dub, outbuf,
							sizeof(outbuf),
							UNIT_KILO);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_BLOCKID:
			switch(type) {
			case JOB:
				tmp_char = job->blockid;
				break;
			case JOBSTEP:
				break;
			case JOBCOMP:
				tmp_char = job_comp->blockid;
				break;
			default:
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_CLUSTER:
			switch(type) {
			case JOB:
				tmp_char = job->cluster;
				break;
			case JOBSTEP:
				tmp_char = step->job_ptr->cluster;
				break;
			case JOBCOMP:
			default:
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_COMMENT:
			switch(type) {
			case JOB:
				tmp_char = job->derived_es;
				break;
			case JOBSTEP:
			case JOBCOMP:
			default:
				tmp_char = NULL;
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_CONSUMED_ENERGY:
			if (got_stats) {
				switch (type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = step->
							stats.consumed_energy;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.consumed_energy;
					break;
				default:
					break;
				}
			}
			if (!fuzzy_equal(tmp_dub, NO_VAL))
				_local_convert_num_unit2((double)tmp_dub,
							 outbuf, sizeof(outbuf),
							 UNIT_NONE, 1000,
							 false);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_CONSUMED_ENERGY_RAW:
			if (got_stats) {
				switch (type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = step->
							stats.consumed_energy;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.consumed_energy;
					break;
				default:
					break;
				}
			}

			field->print_routine(field,
					     tmp_dub,
					     (curr_inx == field_count));
			break;
		case PRINT_CPU_TIME:
			switch(type) {
			case JOB:
				tmp_uint64 = (uint64_t)job->elapsed
					* (uint64_t)cpu_tres_rec_count;
				break;
			case JOBSTEP:
				tmp_uint64 = (uint64_t)step->elapsed
					* (uint64_t)step_cpu_tres_rec_count;
				break;
			case JOBCOMP:
				break;
			default:
				break;
			}
			field->print_routine(field,
					     tmp_uint64,
					     (curr_inx == field_count));
			break;
		case PRINT_CPU_TIME_RAW:
			switch(type) {
			case JOB:
				tmp_uint64 = (uint64_t)job->elapsed
					* (uint64_t)cpu_tres_rec_count;
				break;
			case JOBSTEP:
				tmp_uint64 = (uint64_t)step->elapsed
					* (uint64_t)step_cpu_tres_rec_count;
				break;
			case JOBCOMP:
				break;
			default:
				break;
			}
			field->print_routine(field,
					     tmp_uint64,
					     (curr_inx == field_count));
			break;
		case PRINT_DERIVED_EC:
			tmp_int2 = 0;
			switch(type) {
			case JOB:
				tmp_int = job->derived_ec;
				if (tmp_int == NO_VAL)
					tmp_int = 0;
				if (WIFSIGNALED(tmp_int))
					tmp_int2 = WTERMSIG(tmp_int);

				snprintf(outbuf, sizeof(outbuf), "%d:%d",
					 WEXITSTATUS(tmp_int), tmp_int2);
				break;
			case JOBSTEP:
			case JOBCOMP:
			default:
				outbuf[0] = '\0';
				break;
			}

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_ELAPSED:
			switch(type) {
			case JOB:
				tmp_int = job->elapsed;
				break;
			case JOBSTEP:
				tmp_int = step->elapsed;
				break;
			case JOBCOMP:
				tmp_int = job_comp->elapsed_time;
				break;
			default:
				tmp_int = NO_VAL;
				break;
			}
			field->print_routine(field,
					     (uint64_t)tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_ELIGIBLE:
			switch(type) {
			case JOB:
				tmp_int = job->eligible;
				break;
			case JOBSTEP:
				tmp_int = step->start;
				break;
			case JOBCOMP:
				break;
			default:
				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_END:
			switch(type) {
			case JOB:
				tmp_int = job->end;
				break;
			case JOBSTEP:
				tmp_int = step->end;
				break;
			case JOBCOMP:
				tmp_int = parse_time(job_comp->end_time, 1);
				break;
			default:
				tmp_int = NO_VAL;
				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_EXITCODE:
			tmp_int = 0;
			tmp_int2 = 0;
			switch(type) {
			case JOB:
				tmp_int = job->exitcode;
				break;
			case JOBSTEP:
				tmp_int = step->exitcode;
				break;
			case JOBCOMP:
			default:
				break;
			}
			if (tmp_int != NO_VAL) {
				if (WIFSIGNALED(tmp_int))
					tmp_int2 = WTERMSIG(tmp_int);
				tmp_int = WEXITSTATUS(tmp_int);
				if (tmp_int >= 128)
					tmp_int -= 128;
			}
			snprintf(outbuf, sizeof(outbuf), "%d:%d",
				 tmp_int, tmp_int2);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_GID:
			switch(type) {
			case JOB:
				tmp_int = job->gid;
				break;
			case JOBSTEP:
				tmp_int = NO_VAL;
				break;
			case JOBCOMP:
				tmp_int = job_comp->gid;
				break;
			default:
				tmp_int = NO_VAL;
				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_GROUP:
			switch(type) {
			case JOB:
				tmp_int = job->gid;
				break;
			case JOBSTEP:
				tmp_int = NO_VAL;
				break;
			case JOBCOMP:
				tmp_int = job_comp->gid;
				break;
			default:
				tmp_int = NO_VAL;
				break;
			}
			tmp_char = NULL;
			if ((gr=getgrgid(tmp_int)))
				tmp_char=gr->gr_name;

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_JOBID:
			if (type == JOBSTEP)
				job = step->job_ptr;

			if (job) {
				if (job->array_task_str) {
					_xlate_task_str(job);
					snprintf(id, FORMAT_STRING_SIZE,
						 "%u_[%s]",
						 job->array_job_id,
						 job->array_task_str);
				} else if (job->array_task_id != NO_VAL)
					snprintf(id, FORMAT_STRING_SIZE,
						 "%u_%u",
						 job->array_job_id,
						 job->array_task_id);
				else
					snprintf(id, FORMAT_STRING_SIZE,
						 "%u",
						 job->jobid);
			}

			switch (type) {
			case JOB:
				tmp_char = xstrdup(id);
				break;
			case JOBSTEP:
				if (step->stepid == SLURM_BATCH_SCRIPT) {
					tmp_char = xstrdup_printf(
						"%s.batch", id);
				} else if (step->stepid == SLURM_EXTERN_CONT) {
					tmp_char = xstrdup_printf(
						"%s.extern", id);
				} else {
					tmp_char = xstrdup_printf(
						"%s.%u",
						id, step->stepid);
				}
				break;
			case JOBCOMP:
				tmp_char = xstrdup_printf("%u",
							  job_comp->jobid);
				break;
			default:
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_JOBIDRAW:
			switch (type) {
			case JOB:
				tmp_char = xstrdup_printf("%u", job->jobid);
				break;
			case JOBSTEP:
				if (step->stepid == SLURM_BATCH_SCRIPT) {
					tmp_char = xstrdup_printf(
						"%u.batch",
						step->job_ptr->jobid);
				} else if (step->stepid == SLURM_EXTERN_CONT) {
					tmp_char = xstrdup_printf(
						"%u.extern",
						step->job_ptr->jobid);
				} else {
					tmp_char = xstrdup_printf(
						"%u.%u",
						step->job_ptr->jobid,
						step->stepid);
				}
				break;
			case JOBCOMP:
				tmp_char = xstrdup_printf("%u",
							  job_comp->jobid);
				break;
			default:
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_JOBNAME:
			switch(type) {
			case JOB:
				tmp_char = job->jobname;
				break;
			case JOBSTEP:
				tmp_char = step->stepname;
				break;
			case JOBCOMP:
				tmp_char = job_comp->jobname;
				break;
			default:
				tmp_char = NULL;
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_LAYOUT:
			switch(type) {
			case JOB:
				/* below really should be step.  It is
				   not a typo */
				if (!job->track_steps)
					tmp_char = slurm_step_layout_type_name(
						step->task_dist);
				break;
			case JOBSTEP:
				tmp_char = slurm_step_layout_type_name(
					step->task_dist);
				break;
			case JOBCOMP:
				break;
			default:
				tmp_char = NULL;
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_MAXDISKREAD:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = job->
							stats.disk_read_max;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.disk_read_max;
					break;
				case JOBCOMP:
				default:
					break;
				}
			}
			_print_small_double(outbuf, sizeof(outbuf),
					    tmp_dub, UNIT_MEGA);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_MAXDISKREADNODE:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_char = find_hostname(
							job->stats.
							disk_read_max_nodeid,
							job->nodes);
					break;
				case JOBSTEP:
					tmp_char = find_hostname(
						step->stats.
						disk_read_max_nodeid,
						step->nodes);
					break;
				case JOBCOMP:
				default:
					tmp_char = NULL;
					break;
				}
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_MAXDISKREADTASK:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_uint32 =
							job->stats.
							disk_read_max_taskid;
					break;
				case JOBSTEP:
					tmp_uint32 = step->stats.
						disk_read_max_taskid;
					break;
				case JOBCOMP:
				default:
					tmp_uint32 = NO_VAL;
					break;
				}
			}
			if (tmp_uint32 == (uint32_t)NO_VAL)
				tmp_uint32 = NO_VAL;
			field->print_routine(field,
					     tmp_uint32,
					     (curr_inx == field_count));
			break;
		case PRINT_MAXDISKWRITE:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = job->stats.
							disk_write_max;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.disk_write_max;
					break;
				case JOBCOMP:
				default:
					break;
				}
			}
			_print_small_double(outbuf, sizeof(outbuf),
					    tmp_dub, UNIT_MEGA);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_MAXDISKWRITENODE:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_char = find_hostname(
							job->stats.
							disk_write_max_nodeid,
							job->nodes);
					break;
				case JOBSTEP:
					tmp_char = find_hostname(
						step->stats.
						disk_write_max_nodeid,
						step->nodes);
					break;
				case JOBCOMP:
				default:
					tmp_char = NULL;
					break;
				}
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_MAXDISKWRITETASK:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_uint32 =
							job->stats.
							disk_write_max_taskid;
					break;
				case JOBSTEP:
					tmp_uint32 = step->stats.
						disk_write_max_taskid;
					break;
				case JOBCOMP:
				default:
					tmp_uint32 = NO_VAL;
					break;
				}
				if (tmp_uint32 == (uint32_t)NO_VAL)
					tmp_uint32 = NO_VAL;
			}
			field->print_routine(field,
					     tmp_uint32,
					     (curr_inx == field_count));
			break;
		case PRINT_MAXPAGES:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_uint64 =
							job->stats.pages_max;
					break;
				case JOBSTEP:
					tmp_uint64 = step->stats.pages_max;
					break;
				case JOBCOMP:
				default:
					break;
				}
				if (tmp_uint64 != (uint64_t)NO_VAL)
					_local_convert_num_unit(
							(double)tmp_uint64,
							outbuf, sizeof(outbuf),
							UNIT_KILO);
			}

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_MAXPAGESNODE:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_char = find_hostname(
							job->stats.
							pages_max_nodeid,
							job->nodes);
					break;
				case JOBSTEP:
					tmp_char = find_hostname(
						step->stats.pages_max_nodeid,
						step->nodes);
					break;
				case JOBCOMP:
				default:
					tmp_char = NULL;
					break;
				}
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_MAXPAGESTASK:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_uint32 =
							job->stats.
							pages_max_taskid;
					break;
				case JOBSTEP:
					tmp_uint32 = step->stats.
						pages_max_taskid;
					break;
				case JOBCOMP:
				default:
					tmp_uint32 = NO_VAL;
					break;
				}
				if (tmp_uint32 == (uint32_t)NO_VAL)
					tmp_uint32 = NO_VAL;
			}

			field->print_routine(field,
					     tmp_uint32,
					     (curr_inx == field_count));
			break;
		case PRINT_MAXRSS:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_uint64 = job->stats.rss_max;
					break;
				case JOBSTEP:
					tmp_uint64 = step->stats.rss_max;
					break;
				case JOBCOMP:
				default:
					break;
				}
				if (tmp_uint64 != (uint64_t)NO_VAL)
					_local_convert_num_unit(
							(double)tmp_uint64,
							outbuf, sizeof(outbuf),
							UNIT_KILO);
			}

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_MAXRSSNODE:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_char = find_hostname(
							job->stats.
							rss_max_nodeid,
							job->nodes);
					break;
				case JOBSTEP:
					tmp_char = find_hostname(
						step->stats.rss_max_nodeid,
						step->nodes);
					break;
				case JOBCOMP:
				default:
					tmp_char = NULL;
					break;
				}
			}

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_MAXRSSTASK:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_uint32 = job->stats.
							rss_max_taskid;
					break;
				case JOBSTEP:
					tmp_uint32 = step->stats.rss_max_taskid;
					break;
				case JOBCOMP:
				default:
					tmp_uint32 = NO_VAL;
					break;
				}
				if (tmp_uint32 == (uint32_t)NO_VAL)
					tmp_uint32 = NO_VAL;
			}

			field->print_routine(field,
					     tmp_uint32,
					     (curr_inx == field_count));
			break;
		case PRINT_MAXVSIZE:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_uint64 = job->stats.
							vsize_max;
					break;
				case JOBSTEP:
					tmp_uint64 = step->stats.vsize_max;
					break;
				case JOBCOMP:
				default:
					tmp_uint64 = (uint64_t)NO_VAL;
					break;
				}

				if (tmp_uint64 != (uint64_t)NO_VAL)
					_local_convert_num_unit(
							(double)tmp_uint64,
							 outbuf, sizeof(outbuf),
							 UNIT_KILO);
			}

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_MAXVSIZENODE:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_char = find_hostname(
							job->stats.
							vsize_max_nodeid,
							job->nodes);
					break;
				case JOBSTEP:
					tmp_char = find_hostname(
						step->stats.vsize_max_nodeid,
						step->nodes);
					break;
				case JOBCOMP:
				default:
					tmp_char = NULL;
					break;
				}
			}

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_MAXVSIZETASK:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_uint32 =
							job->stats.
							vsize_max_taskid;
					break;
				case JOBSTEP:
					tmp_uint32 = step->stats.
						vsize_max_taskid;
					break;
				case JOBCOMP:
				default:
					tmp_uint32 = NO_VAL;
					break;
				}
				if (tmp_uint32 == (uint32_t)NO_VAL)
					tmp_uint32 = NO_VAL;
			}

			field->print_routine(field,
					     tmp_uint32,
					     (curr_inx == field_count));
			break;
		case PRINT_MINCPU:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_dub = job->stats.cpu_min;
					break;
				case JOBSTEP:
					tmp_dub = step->stats.cpu_min;
					break;
				case JOBCOMP:
				default:
					break;
				}
				if (!fuzzy_equal(tmp_dub, NO_VAL))
					tmp_char = _elapsed_time(
						(long)tmp_dub, 0);
			}

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_MINCPUNODE:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_char = find_hostname(
							job->stats.
							cpu_min_nodeid,
							job->nodes);
					break;
				case JOBSTEP:
					tmp_char = find_hostname(
						step->stats.cpu_min_nodeid,
						step->nodes);
					break;
				case JOBCOMP:
				default:
					tmp_char = NULL;
					break;
				}
			}

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_MINCPUTASK:
			if (got_stats) {
				switch(type) {
				case JOB:
					if (!job->track_steps)
						tmp_uint32 = job->stats.
							cpu_min_taskid;
					break;
				case JOBSTEP:
					tmp_uint32 = step->stats.cpu_min_taskid;
					break;
				case JOBCOMP:
				default:
					tmp_uint32 = NO_VAL;
					break;
				}
				if (tmp_uint32 == (uint32_t)NO_VAL)
					tmp_uint32 = NO_VAL;
			}

			field->print_routine(field,
					     tmp_uint32,
					     (curr_inx == field_count));
			break;
		case PRINT_NODELIST:
			switch(type) {
			case JOB:
				tmp_char = job->nodes;
				break;
			case JOBSTEP:
				tmp_char = step->nodes;
				break;
			case JOBCOMP:
				tmp_char = job_comp->nodelist;
				break;
			default:
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_NNODES:
			switch(type) {
			case JOB:
				tmp_int = job->alloc_nodes;
				tmp_char = job->nodes;
				break;
			case JOBSTEP:
				tmp_int = step->nnodes;
				tmp_char = step->nodes;
				break;
			case JOBCOMP:
				tmp_int = job_comp->node_cnt;
				tmp_char = job_comp->nodelist;
				break;
			default:
				break;
			}

			if (!tmp_int) {
				hostlist_t hl = hostlist_create(tmp_char);
				tmp_int = hostlist_count(hl);
				hostlist_destroy(hl);
			}
			_local_convert_num_unit((double)tmp_int, outbuf,
						sizeof(outbuf), UNIT_NONE);
			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_NTASKS:
			switch(type) {
			case JOB:
				if (!job->track_steps && !step)
					tmp_int = cpu_tres_rec_count;
				/* we want to use the step info; fall through */
				if (!step)
					break;
			case JOBSTEP:
				tmp_int = step->ntasks;
				break;
			case JOBCOMP:
			default:

				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_PRIO:
			switch(type) {
			case JOB:
				tmp_int = job->priority;
				break;
			case JOBSTEP:

				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_PARTITION:
			switch(type) {
			case JOB:
				tmp_char = job->partition;
				break;
			case JOBSTEP:

				break;
			case JOBCOMP:
				tmp_char = job_comp->partition;
				break;
			default:

				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_QOS:
			switch(type) {
			case JOB:
				tmp_int = job->qosid;
				break;
			case JOBSTEP:

				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			if (!g_qos_list) {
				slurmdb_qos_cond_t qos_cond;
				memset(&qos_cond, 0,
				       sizeof(slurmdb_qos_cond_t));
				qos_cond.with_deleted = 1;
				g_qos_list = slurmdb_qos_get(
					acct_db_conn, &qos_cond);
			}

			tmp_char = _find_qos_name_from_list(g_qos_list,
							    tmp_int);
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_QOSRAW:
			switch(type) {
			case JOB:
				tmp_int = job->qosid;
				break;
			case JOBSTEP:

				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_REQ_CPUFREQ_MIN:
			switch (type) {
			case JOB:
				if (!job->track_steps && !step)
					tmp_dub = NO_VAL;
				/* we want to use the step info; fall through */
				if (!step)
					break;
			case JOBSTEP:
				tmp_dub = step->req_cpufreq_min;
				break;
			default:
				break;
			}
			cpu_freq_to_string(outbuf, sizeof(outbuf), tmp_dub);
			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_REQ_CPUFREQ_MAX:
			switch (type) {
			case JOB:
				if (!job->track_steps && !step)
					tmp_dub = NO_VAL;
				/* we want to use the step info; fall through */
				if (!step)
					break;
			case JOBSTEP:
				tmp_dub = step->req_cpufreq_max;
				break;
			default:
				break;
			}
			cpu_freq_to_string(outbuf, sizeof(outbuf), tmp_dub);
			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_REQ_CPUFREQ_GOV:
			switch (type) {
			case JOB:
				if (!job->track_steps && !step)
					tmp_dub = NO_VAL;
				/* we want to use the step info; fall through */
				if (!step)
					break;
			case JOBSTEP:
				tmp_dub = step->req_cpufreq_gov;
				break;
			default:
				break;
			}
			cpu_freq_to_string(outbuf, sizeof(outbuf), tmp_dub);
			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_REQ_CPUS:
			switch(type) {
			case JOB:
				tmp_int = job->req_cpus;
				break;
			case JOBSTEP:
				tmp_int = step_cpu_tres_rec_count;
				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_REQ_GRES:
			switch(type) {
			case JOB:
				tmp_char = job->req_gres;
				break;
			case JOBSTEP:
				tmp_char = step->job_ptr->req_gres;
				break;
			case JOBCOMP:
			default:
				tmp_char = NULL;
				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_REQ_MEM:
			switch(type) {
			case JOB:
				tmp_uint32 = job->req_mem;
				break;
			case JOBSTEP:
				tmp_uint32 = step->job_ptr->req_mem;
				break;
			case JOBCOMP:
			default:
				tmp_uint32 = NO_VAL;
				break;
			}

			if (tmp_uint32 != (uint32_t)NO_VAL) {
				bool per_cpu = false;
				if (tmp_uint32 & MEM_PER_CPU) {
					tmp_uint32 &= (~MEM_PER_CPU);
					per_cpu = true;
				}
				_local_convert_num_unit((double)tmp_uint32,
							outbuf, sizeof(outbuf),
							UNIT_MEGA);
				if (per_cpu)
					sprintf(outbuf+strlen(outbuf), "c");
				else
					sprintf(outbuf+strlen(outbuf), "n");
			}
			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_RESERVATION:
			switch(type) {
			case JOB:
				if (job->resv_name) {
					tmp_char = job->resv_name;
				} else {
					tmp_char = NULL;
				}
				break;
			case JOBSTEP:
				tmp_char = NULL;
				break;
			case JOBCOMP:
				tmp_char = NULL;
				break;
			default:
				tmp_char = NULL;
				break;
			}
			field->print_routine(field,
						tmp_char,
						(curr_inx == field_count));
			break;
		case PRINT_RESERVATION_ID:
			switch(type) {
			case JOB:
				if (job->resvid)
					tmp_uint32 = job->resvid;
				else
					tmp_uint32 = NO_VAL;
				break;
			case JOBSTEP:
				tmp_uint32 = NO_VAL;
				break;
			case JOBCOMP:
				tmp_uint32 = NO_VAL;
				break;
			default:
				tmp_uint32 = NO_VAL;
				break;
			}
			if (tmp_uint32 == (uint32_t)NO_VAL)
				tmp_uint32 = NO_VAL;
			field->print_routine(field,
						tmp_uint32,
						(curr_inx == field_count));
			break;
		case PRINT_RESV:
			switch(type) {
			case JOB:
				if (job->start)
					tmp_int = job->start - job->eligible;
				else
					tmp_int = time(NULL) - job->eligible;
				break;
			case JOBSTEP:
				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			field->print_routine(field,
					     (uint64_t)tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_RESV_CPU:
			switch(type) {
			case JOB:
				if (job->start)
					tmp_int = (job->start - job->eligible)
						* job->req_cpus;
				else
					tmp_int = (time(NULL) - job->eligible)
						* job->req_cpus;
				break;
			case JOBSTEP:
				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			field->print_routine(field,
					     (uint64_t)tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_RESV_CPU_RAW:
			switch(type) {
			case JOB:
				if (job->start)
					tmp_int = (job->start - job->eligible)
						* job->req_cpus;
				else
					tmp_int = (time(NULL) - job->eligible)
						* job->req_cpus;
				break;
			case JOBSTEP:
				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_START:
			switch(type) {
			case JOB:
				tmp_int = job->start;
				break;
			case JOBSTEP:
				tmp_int = step->start;
				break;
			case JOBCOMP:
				tmp_int = parse_time(job_comp->start_time, 1);
				break;
			default:

				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_STATE:
			switch(type) {
			case JOB:
				tmp_int = job->state;
				tmp_int2 = job->requid;
				break;
			case JOBSTEP:
				tmp_int = step->state;
				tmp_int2 = step->requid;
				break;
			case JOBCOMP:
				tmp_char = job_comp->state;
				break;
			default:

				break;
			}

			if (((tmp_int & JOB_STATE_BASE) == JOB_CANCELLED) &&
			    (tmp_int2 != -1))
				snprintf(outbuf, FORMAT_STRING_SIZE,
					 "%s by %d",
					 job_state_string(tmp_int),
					 tmp_int2);
			else if (tmp_int != NO_VAL)
				snprintf(outbuf, FORMAT_STRING_SIZE,
					 "%s",
					 job_state_string(tmp_int));
			else if (tmp_char)
				snprintf(outbuf, FORMAT_STRING_SIZE,
					 "%s",
					 tmp_char);

			field->print_routine(field,
					     outbuf,
					     (curr_inx == field_count));
			break;
		case PRINT_SUBMIT:
			switch(type) {
			case JOB:
				tmp_int = job->submit;
				break;
			case JOBSTEP:
				tmp_int = step->start;
				break;
			case JOBCOMP:
				tmp_int = parse_time(job_comp->start_time, 1);
				break;
			default:

				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_SUSPENDED:
			switch(type) {
			case JOB:
				tmp_int = job->suspended;
				break;
			case JOBSTEP:
				tmp_int = step->suspended;
				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			field->print_routine(field,
					     (uint64_t)tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_SYSTEMCPU:
			if (got_stats) {
				switch(type) {
				case JOB:
					tmp_int = job->sys_cpu_sec;
					tmp_int2 = job->sys_cpu_usec;
					break;
				case JOBSTEP:
					tmp_int = step->sys_cpu_sec;
					tmp_int2 = step->sys_cpu_usec;
					break;
				case JOBCOMP:
				default:
					break;
				}
				tmp_char = _elapsed_time(tmp_int, tmp_int2);
			}

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_TIMELIMIT:
			switch(type) {
			case JOB:
				if (job->timelimit == INFINITE)
					tmp_char = "UNLIMITED";
				else if (job->timelimit == NO_VAL)
					tmp_char = "Partition_Limit";
				else if (job->timelimit) {
					/* format into outbuf so the string
					 * outlives this block; a block-local
					 * buffer here would dangle */
					mins2time_str(job->timelimit,
						      outbuf, sizeof(outbuf));
					tmp_char = outbuf;
				}
				break;
			case JOBSTEP:
				break;
			case JOBCOMP:
				tmp_char = job_comp->timelimit;
				break;
			default:

				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_TOTALCPU:
			switch(type) {
			case JOB:
				tmp_int = job->tot_cpu_sec;
				tmp_int2 = job->tot_cpu_usec;
				break;
			case JOBSTEP:
				tmp_int = step->tot_cpu_sec;
				tmp_int2 = step->tot_cpu_usec;
				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			tmp_char = _elapsed_time(tmp_int, tmp_int2);

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_TRES:
			switch(type) {
			case JOB:
				tmp_char = job->tres_alloc_str;
				break;
			case JOBSTEP:
				tmp_char = step->tres_alloc_str;
				break;
			case JOBCOMP:
			default:
				tmp_char = NULL;
				break;
			}

			if (!g_tres_list) {
				slurmdb_tres_cond_t tres_cond;
				memset(&tres_cond, 0,
				       sizeof(slurmdb_tres_cond_t));
				tres_cond.with_deleted = 1;
				g_tres_list = slurmdb_tres_get(
					acct_db_conn, &tres_cond);
			}

			tmp_char = slurmdb_make_tres_string_from_simple(
				tmp_char, g_tres_list);

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_UID:
			switch(type) {
			case JOB:
				if (job->user) {
					if ((pw=getpwnam(job->user)))
						tmp_int = pw->pw_uid;
				} else
					tmp_int = job->uid;
				break;
			case JOBSTEP:
				break;
			case JOBCOMP:
				tmp_int = job_comp->uid;
				break;
			default:

				break;
			}

			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		case PRINT_USER:
			switch(type) {
			case JOB:
				if (job->user)
					tmp_char = job->user;
				else if (job->uid != -1) {
					if ((pw=getpwuid(job->uid)))
						tmp_char = pw->pw_name;
				}
				break;
			case JOBSTEP:

				break;
			case JOBCOMP:
				tmp_char = job_comp->uid_name;
				break;
			default:

				break;
			}

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_USERCPU:
			if (got_stats) {
				switch(type) {
				case JOB:
					tmp_int = job->user_cpu_sec;
					tmp_int2 = job->user_cpu_usec;
					break;
				case JOBSTEP:
					tmp_int = step->user_cpu_sec;
					tmp_int2 = step->user_cpu_usec;
					break;
				case JOBCOMP:
				default:
					break;
				}
				tmp_char = _elapsed_time(tmp_int, tmp_int2);
			}

			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			xfree(tmp_char);
			break;
		case PRINT_WCKEY:
			switch(type) {
			case JOB:
				tmp_char = job->wckey;
				break;
			case JOBSTEP:

				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			field->print_routine(field,
					     tmp_char,
					     (curr_inx == field_count));
			break;
		case PRINT_WCKEYID:
			switch(type) {
			case JOB:
				tmp_int = job->wckeyid;
				break;
			case JOBSTEP:

				break;
			case JOBCOMP:

				break;
			default:

				break;
			}
			field->print_routine(field,
					     tmp_int,
					     (curr_inx == field_count));
			break;
		default:
			break;
		}
		curr_inx++;
	}
	printf("\n");
}
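
Every branch above funnels into the same field->print_routine(field, value, curr_inx == field_count) call; the switch only chooses which value to hand over. A hypothetical reduction of that table-driven pattern (the names here are illustrative, not Slurm's):

#include <stdio.h>

typedef struct {
	const char *name;
	void (*print_routine)(const char *name, long value, int last);
} field_t;

static void print_plain(const char *name, long value, int last)
{
	printf("%s=%ld%s", name, value, last ? "\n" : " ");
}

int main(void)
{
	field_t fields[] = {
		{ "AllocCPUS", print_plain },
		{ "Elapsed",   print_plain },
	};
	long values[] = { 16, 3600 };
	int i, n = sizeof(fields) / sizeof(fields[0]);

	for (i = 0; i < n; i++)
		fields[i].print_routine(fields[i].name, values[i],
					i == n - 1);	/* mirrors curr_inx == field_count */
	return 0;
}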
Example #23
File: allocate.c Project: jabl/slurm
/*
 * Read a SLURM hostfile specified by "filename".  "filename" must contain
 * a list of SLURM NodeNames, one per line.  Reads up to "n" hostnames from
 * the file; if "n" == NO_VAL the entire file is read.
 *
 * Returns a string representing a ranged hostlist of the contents of the
 * file.  Returns NULL if there are fewer than "n" hostnames in the file or
 * if an error occurs.  This is a helper function; it does not contact any
 * SLURM daemons.
 *
 * The returned string must be freed with free().
 */
char *slurm_read_hostfile(char *filename, int n)
{
	FILE *fp = NULL;
	char in_line[BUFFER_SIZE];	/* input line */
	int i, j;
	int line_size;
	int line_num = 0;
	hostlist_t hostlist = NULL;
	char *nodelist = NULL;
	char *asterisk, *tmp_text, *save_ptr = NULL, *host_name;
	int total_file_len = 0;

	if (filename == NULL || strlen(filename) == 0)
		return NULL;

	if ((fp = fopen(filename, "r")) == NULL) {
		error("slurm_allocate_resources error opening file %s, %m",
		      filename);
		return NULL;
	}

	hostlist = hostlist_create(NULL);
	if (hostlist == NULL) {
		fclose(fp);
		return NULL;
	}

	while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {
		line_num++;
		if (!isalpha(in_line[0]) && !isdigit(in_line[0])) {
			error ("Invalid hostfile %s contents on line %d",
			       filename, line_num);
			fclose (fp);
			hostlist_destroy(hostlist);
			return NULL;
		}

		line_size = strlen(in_line);
		total_file_len += line_size;
		if (line_size == (BUFFER_SIZE - 1)) {
			error ("Line %d, of hostfile %s too long",
			       line_num, filename);
			fclose (fp);
			hostlist_destroy(hostlist);
			return NULL;
		}

		for (i = 0; i < line_size; i++) {
			if (in_line[i] == '\n') {
				in_line[i] = '\0';
				break;
			}
			if (in_line[i] == '\0')
				break;
			if (in_line[i] != '#')
				continue;
			if ((i > 0) && (in_line[i - 1] == '\\')) {
				for (j = i; j < line_size; j++) {
					in_line[j - 1] = in_line[j];
				}
				line_size--;
				continue;
			}
			in_line[i] = '\0';
			break;
		}

		tmp_text = xstrdup(in_line);
		host_name = strtok_r(tmp_text, ",", &save_ptr);
		while (host_name) {
			if ((asterisk = strchr(host_name, '*')) &&
			    (i = atoi(asterisk + 1))) {
				asterisk[0] = '\0';
				for (j = 0; j < i; j++)
					hostlist_push_host(hostlist, host_name);
			} else {
				hostlist_push_host(hostlist, host_name);
			}
			host_name = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp_text);

		if ((n != (int)NO_VAL) && (hostlist_count(hostlist) == n))
			break;
	}
	fclose(fp);

	if (hostlist_count(hostlist) <= 0) {
		error("Hostlist is empty!");
		goto cleanup_hostfile;
	}
	if (hostlist_count(hostlist) < n) {
		error("Too few NodeNames in SLURM Hostfile");
		goto cleanup_hostfile;
	}

	total_file_len += 1024;
	nodelist = (char *)malloc(total_file_len);
	if (!nodelist) {
		error("Nodelist xmalloc failed");
		goto cleanup_hostfile;
	}

	if (hostlist_ranged_string(hostlist, total_file_len, nodelist) == -1) {
		error("Hostlist is too long for the allocate RPC!");
		free(nodelist);
		nodelist = NULL;
		goto cleanup_hostfile;
	}

	debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist);

cleanup_hostfile:
	hostlist_destroy(hostlist);

	return nodelist;
}
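
A sketch of a caller, following the contract in the header comment: the result comes from malloc() and is released with free(). The header and the visibility of NO_VAL through it are assumptions.

#include <stdio.h>
#include <stdlib.h>
#include <slurm/slurm.h>	/* slurm_read_hostfile(), NO_VAL (assumed) */

int main(void)
{
	char fname[] = "hosts.txt";	/* one NodeName per line */
	char *nodes = slurm_read_hostfile(fname, NO_VAL);

	if (!nodes) {
		fprintf(stderr, "could not read hostfile\n");
		return 1;
	}
	printf("nodelist: %s\n", nodes);	/* e.g. "tux[0-15]" */
	free(nodes);
	return 0;
}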
Example #24
File: forward.c Project: IFCA/slurm
/*
 * start_msg_tree  - logic to begin the forward tree and
 *                   accumulate the return codes from the processes
 *                   receiving the forwarded message
 *
 * IN: hl          - hostlist_t   - list of every node to send message to
 * IN: msg         - slurm_msg_t  - message to send.
 * IN: timeout     - int          - how long to wait in milliseconds.
 * RET List        - List containing the responses of the children
 *                   (if any) we forwarded the message to. The List
 *                   contains entries of type (ret_data_info_t).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	int *span = NULL;
	fwd_tree_t *fwd_tree = NULL;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int j = 0, count = 0;
	List ret_list = NULL;
	char *name = NULL;
	int thr_count = 0;
	int host_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	span = set_span(host_count, 0);

	slurm_mutex_init(&tree_mutex);
	pthread_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		fwd_tree->orig_msg = msg;
		fwd_tree->ret_list = ret_list;
		fwd_tree->timeout = timeout;
		fwd_tree->notify = &notify;
		fwd_tree->p_thr_count = &thr_count;
		fwd_tree->tree_mutex = &tree_mutex;

		if (fwd_tree->timeout <= 0) {
			/* convert secs to msec */
			fwd_tree->timeout  = slurm_get_msg_timeout() * 1000;
		}

		fwd_tree->tree_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(fwd_tree->tree_hl, name);
			free(name);
		}

		/*
		 * Lock and increase thread counter, we need that to protect
		 * the start_msg_tree waiting loop that was originally designed
		 * around a "while ((count < host_count))" loop. In case where a
		 * fwd thread was not able to get all the return codes from
		 * children, the waiting loop was deadlocked.
		 */
		slurm_mutex_lock(&tree_mutex);
		thr_count++;
		slurm_mutex_unlock(&tree_mutex);

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);

	}
	xfree(span);

	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	while (thr_count > 0) {
		pthread_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	xassert(count >= host_count);	/* Tree head did not get all responses,
					 * but no more active fwd threads!*/
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	pthread_cond_destroy(&notify);

	return ret_list;
}
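
The comment about the deadlocked waiting loop is the heart of this function: the tree head waits on a count of live threads rather than on the number of returned results. A generic sketch of that counted-thread pattern, using plain pthreads only (this is not Slurm code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int thr_count = 0;

static void *worker(void *arg)
{
	(void) arg;	/* ... forward the message, gather results ... */
	pthread_mutex_lock(&lock);
	thr_count--;	/* one fewer outstanding thread */
	pthread_cond_signal(&done);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_attr_t attr;
	pthread_t tid;
	int i;

	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	for (i = 0; i < 4; i++) {
		pthread_mutex_lock(&lock);
		thr_count++;	/* count up before the thread can exit */
		pthread_mutex_unlock(&lock);
		pthread_create(&tid, &attr, worker, NULL);
	}
	pthread_attr_destroy(&attr);

	pthread_mutex_lock(&lock);
	while (thr_count > 0)	/* wait on the counter, not on results */
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
	puts("all forwarding threads finished");
	return 0;
}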
Example #25
File: node_conf.c Project: acrognale/slurm
/*
 * build_all_frontend_info - get an array of slurm_conf_frontend_t structures
 *	from the slurm.conf reader, build the table, and set values
 * is_slurmd_context: set to true if run from slurmd
 * RET 0 if no error, error code otherwise
 */
extern int build_all_frontend_info (bool is_slurmd_context)
{
	slurm_conf_frontend_t **ptr_array;
#ifdef HAVE_FRONT_END
	slurm_conf_frontend_t *fe_single, *fe_line;
	int i, count, max_rc = SLURM_SUCCESS;
	bool front_end_debug;

	if (slurm_get_debug_flags() & DEBUG_FLAG_FRONT_END)
		front_end_debug = true;
	else
		front_end_debug = false;
	count = slurm_conf_frontend_array(&ptr_array);
	if (count == 0)
		fatal("No FrontendName information available!");

	for (i = 0; i < count; i++) {
		hostlist_t hl_name, hl_addr;
		char *fe_name, *fe_addr;

		fe_line = ptr_array[i];
		hl_name = hostlist_create(fe_line->frontends);
		if (hl_name == NULL)
			fatal("Invalid FrontendName:%s", fe_line->frontends);
		hl_addr = hostlist_create(fe_line->addresses);
		if (hl_addr == NULL)
			fatal("Invalid FrontendAddr:%s", fe_line->addresses);
		if (hostlist_count(hl_name) != hostlist_count(hl_addr)) {
			fatal("Inconsistent node count between "
			      "FrontendName(%s) and FrontendAddr(%s)",
			      fe_line->frontends, fe_line->addresses);
		}
		while ((fe_name = hostlist_shift(hl_name))) {
			fe_addr = hostlist_shift(hl_addr);
			fe_single = xmalloc(sizeof(slurm_conf_frontend_t));
			list_append(front_end_list, fe_single);
			fe_single->frontends = xstrdup(fe_name);
			fe_single->addresses = xstrdup(fe_addr);
			free(fe_name);
			free(fe_addr);
			if (fe_line->allow_groups && fe_line->allow_groups[0]) {
				fe_single->allow_groups =
					xstrdup(fe_line->allow_groups);
			}
			if (fe_line->allow_users && fe_line->allow_users[0]) {
				fe_single->allow_users =
					xstrdup(fe_line->allow_users);
			}
			if (fe_line->deny_groups && fe_line->deny_groups[0]) {
				fe_single->deny_groups =
					xstrdup(fe_line->deny_groups);
			}
			if (fe_line->deny_users && fe_line->deny_users[0]) {
				fe_single->deny_users =
					xstrdup(fe_line->deny_users);
			}
			fe_single->port = fe_line->port;
			if (fe_line->reason && fe_line->reason[0])
				fe_single->reason = xstrdup(fe_line->reason);
			fe_single->node_state = fe_line->node_state;
			if (front_end_debug && !is_slurmd_context)
				_dump_front_end(fe_single);
		}
		hostlist_destroy(hl_addr);
		hostlist_destroy(hl_name);
	}
	return max_rc;
#else
	if (slurm_conf_frontend_array(&ptr_array) != 0)
		fatal("FrontendName information configured!");
	return SLURM_SUCCESS;
#endif
}
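
The name/address pairing above depends on shifting two equal-length hostlists in lockstep. A minimal sketch of that idiom, again assuming only the hostlist API and header name:

#include <stdio.h>
#include <stdlib.h>
#include "hostlist.h"	/* assumed header for the hostlist API */

int main(void)
{
	hostlist_t names = hostlist_create("fe[1-2]");
	hostlist_t addrs = hostlist_create("192.168.0.[1-2]");
	char *n, *a;

	if (hostlist_count(names) != hostlist_count(addrs)) {
		fprintf(stderr, "name/address count mismatch\n");
		return 1;
	}
	while ((n = hostlist_shift(names))) {
		a = hostlist_shift(addrs);	/* stays aligned with n */
		printf("%s -> %s\n", n, a);
		free(n);
		free(a);
	}
	hostlist_destroy(names);
	hostlist_destroy(addrs);
	return 0;
}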
Example #26
/*
 * route_p_split_hostlist - logic to split an input hostlist into
 *                           a set of hostlists to forward to.
 *
 * IN: hl        - hostlist_t   - list of every node to send the message to;
 *                                will be empty on return.
 * OUT: sp_hl    - hostlist_t** - the array of hostlists that will be malloced
 * OUT: count    - int*         - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: the created hostlists must be freed independently by the caller
 *       using hostlist_destroy.
 * Note: the hostlist_t array itself must be xfree'd.
 */
extern int route_p_split_hostlist(hostlist_t hl,
				  hostlist_t** sp_hl,
				  int* count)
{
	int i, j, k, hl_ndx, msg_count, sw_count, lst_count;
	char  *buf;
	bitstr_t *nodes_bitmap = NULL;		/* nodes in message list */
	bitstr_t *fwd_bitmap = NULL;		/* nodes in forward list */

	msg_count = hostlist_count(hl);
	if (switch_record_cnt == 0) {
		/* configs have not yet been processed */
		slurm_conf_init(NULL);
		if (init_node_conf()) {
			fatal("ROUTE: Failed to init slurm config");
		}
		if (build_all_nodeline_info(false)) {
			fatal("ROUTE: Failed to build node config");
		}
		rehash_node();

		if (slurm_topo_build_config() != SLURM_SUCCESS) {
			fatal("ROUTE: Failed to build topology config");
		}
	}
	*sp_hl = (hostlist_t*) xmalloc(switch_record_cnt * sizeof(hostlist_t));
	/* create bitmap of nodes to send the message to */
	if (hostlist2bitmap (hl, false, &nodes_bitmap) != SLURM_SUCCESS) {
		buf = hostlist_ranged_string_xmalloc(hl);
		fatal("ROUTE: Failed to make bitmap from hostlist=%s.", buf);
	}

	/* Find lowest level switch containing all the nodes in the list */
	j = 0;
	for (i = 0; i <= switch_levels; i++) {
		for (j=0; j<switch_record_cnt; j++) {
			if (switch_record_table[j].level == i) {
				if (bit_super_set(nodes_bitmap,
						  switch_record_table[j].
						  node_bitmap)) {
					/* All nodes in message list are in
					 * this switch */
					break;
				}
			}
		}
		if (j < switch_record_cnt) {
			/* Got here via break after bit_super_set */
			break; // 'j' is our switch
		} /* else, no switches at this level reach all nodes */
	}
	if (i > switch_levels) {
		/* This can only happen if trying to schedule multiple physical
		 * clusters as a single logical cluster under the control of a
		 * single slurmctld daemon, and sending something like a
		 * node_registration request to all nodes.
		 * Revert to default behavior. */
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc(hl);
			debug("ROUTE: didn't find switch containing nodes=%s",
			      buf);
			xfree(buf);
		}
		FREE_NULL_BITMAP(nodes_bitmap);
		xfree(*sp_hl);
		return route_split_hostlist_treewidth(hl, sp_hl, count);
	}
	if (switch_record_table[j].level == 0) {
		/* This is a leaf switch. Construct list based on TreeWidth */
		FREE_NULL_BITMAP(nodes_bitmap);
		xfree(*sp_hl);
		return route_split_hostlist_treewidth(hl, sp_hl, count);
	}
	/* loop through children, constructing a hostlist for each child switch
	 * with nodes in the message list */
	hl_ndx = 0;
	lst_count = 0;
	for (i=0; i < switch_record_table[j].num_switches; i++) {
		k = switch_record_table[j].switch_index[i];
		fwd_bitmap = bit_copy(switch_record_table[k].node_bitmap);
		bit_and(fwd_bitmap, nodes_bitmap);
		sw_count = bit_set_count(fwd_bitmap);
		if (sw_count == 0) {
			continue; /* no nodes on this switch in message list */
		}
		(*sp_hl)[hl_ndx] = bitmap2hostlist(fwd_bitmap);
		/* Now remove nodes from this switch from message list */
		bit_not(fwd_bitmap);
		bit_and(nodes_bitmap, fwd_bitmap);
		FREE_NULL_BITMAP(fwd_bitmap);
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc((*sp_hl)[hl_ndx]);
			debug("ROUTE: ... sublist[%d] switch=%s :: %s",
			      i, switch_record_table[k].name, buf);
			xfree(buf);
		}
		hl_ndx++;
		lst_count += sw_count;
		if (lst_count == msg_count)
			break; /* all nodes in message are in a child list */
	}
	FREE_NULL_BITMAP(nodes_bitmap);

	*count = hl_ndx;
	return SLURM_SUCCESS;

}
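
Stripped of the topology plumbing, the child loop is plain bitmap arithmetic: intersect each child switch's node mask with the remaining message list, emit the intersection, and clear those bits. A toy model with integer masks (Slurm uses bitstr_t, not ints):

#include <stdio.h>

int main(void)
{
	unsigned msg    = 0x0F;			/* nodes 0-3 need the message */
	unsigned sw[3]  = { 0x03, 0x0C, 0x30 };	/* node masks of three child switches */
	unsigned remain = msg;
	int i;

	for (i = 0; i < 3 && remain; i++) {
		unsigned fwd = sw[i] & remain;	/* nodes on this switch */
		if (!fwd)
			continue;		/* no overlap, skip switch */
		printf("sublist %d: mask 0x%02X\n", i, fwd);
		remain &= ~fwd;			/* drop them from the message list */
	}
	return 0;
}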
Example #27
File: as_pg_common.c Project: IFCA/slurm
/*
 * setup_cluster_nodes - get the list of cluster records, with their node
 *   lists, for the requested time period. Used to decide whether a
 *   nodelist overlaps the required nodes.
 */
extern cluster_nodes_t *
setup_cluster_nodes(pgsql_conn_t *pg_conn, slurmdb_job_cond_t *job_cond)
{
	DEF_VARS;
	cluster_nodes_t *cnodes = NULL;
	time_t now = time(NULL);
	hostlist_t temp_hl = NULL;
	hostlist_iterator_t h_itr = NULL;

	if (!job_cond || !job_cond->used_nodes)
		return NULL;

	if (!job_cond->cluster_list || list_count(job_cond->cluster_list) != 1) {
		error("If you are doing a query against nodes "
		      "you must only have 1 cluster "
		      "you are asking for.");
		return NULL;
	}

	temp_hl = hostlist_create(job_cond->used_nodes);
	if (!hostlist_count(temp_hl)) {
		error("we didn't get any real hosts to look for.");
		hostlist_destroy(temp_hl);
		return NULL;
	}

	query = xstrdup_printf("SELECT cluster_nodes, time_start, "
			       "time_end FROM %s.%s WHERE node_name='' "
			       "AND cluster_nodes !=''",
			       (char *)list_peek(job_cond->cluster_list),
			       event_table);

	if (job_cond->usage_start) {
		if (!job_cond->usage_end)
			job_cond->usage_end = now;

		xstrfmtcat(query, " AND ((time_start<%ld) "
			   "AND (time_end>=%ld OR time_end=0))",
			   job_cond->usage_end, job_cond->usage_start);
	}

	result = DEF_QUERY_RET;
	if (!result) {
		hostlist_destroy(temp_hl);
		return NULL;
	}

	h_itr = hostlist_iterator_create(temp_hl);
	cnodes = xmalloc(sizeof(cluster_nodes_t));
	cnodes->cluster_list = list_create(_destroy_local_cluster);
	FOR_EACH_ROW {
		char *host = NULL;
		int loc = 0;
		local_cluster_t *local_cluster =
			xmalloc(sizeof(local_cluster_t));
		local_cluster->hl = hostlist_create(ROW(0));
		local_cluster->start = atoi(ROW(1));
		local_cluster->end   = atoi(ROW(2));
		local_cluster->asked_bitmap =
			bit_alloc(hostlist_count(local_cluster->hl));
		while((host = hostlist_next(h_itr))) {
			if ((loc = hostlist_find(
				    local_cluster->hl, host)) != -1)
				bit_set(local_cluster->asked_bitmap, loc);
			free(host);
		}
		hostlist_iterator_reset(h_itr);
		if (bit_ffs(local_cluster->asked_bitmap) != -1) {
			list_append(cnodes->cluster_list, local_cluster);
			if (local_cluster->end == 0) {
				local_cluster->end = now;
				cnodes->curr_cluster = local_cluster;
			}
		} else
			_destroy_local_cluster(local_cluster);
	} END_EACH_ROW;
	PQclear(result);
	hostlist_iterator_destroy(h_itr);
	if (!list_count(cnodes->cluster_list)) {
		destroy_cluster_nodes(cnodes);
		cnodes = NULL;
	}

	hostlist_destroy(temp_hl);
	return cnodes;
}
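
The asked_bitmap logic reduces to an overlap test: for each requested host, look up its position in a candidate node list and mark it. A sketch of that test using hostlist_find(), with the header name assumed as before:

#include <stdio.h>
#include <stdlib.h>
#include "hostlist.h"	/* assumed header for the hostlist API */

int main(void)
{
	hostlist_t cluster = hostlist_create("tux[0-7]");
	hostlist_t wanted  = hostlist_create("tux3,tux9");
	hostlist_iterator_t itr = hostlist_iterator_create(wanted);
	char *host;
	int overlap = 0;

	while ((host = hostlist_next(itr))) {
		if (hostlist_find(cluster, host) != -1)
			overlap++;	/* would set a bit in asked_bitmap */
		free(host);
	}
	hostlist_iterator_destroy(itr);
	printf("%d of %d requested hosts overlap\n",
	       overlap, hostlist_count(wanted));
	hostlist_destroy(wanted);
	hostlist_destroy(cluster);
	return 0;
}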
Example #28
int
pstdout_launch(const char *hostnames, Pstdout_Thread pstdout_func, void *arg)
{
    struct pstdout_thread_data **tdata = NULL;
    struct pstdout_state pstate;
    unsigned int pstate_init = 0;
    hostlist_iterator_t hitr = NULL;
    hostlist_t h = NULL;
    int h_count = 0;
    char *host = NULL;
    int exit_code = -1;
    sighandler_t sighandler_save = NULL;
    int sighandler_set = 0;
    int rc;
    int i;

    if (!pstdout_initialized)
    {
        pstdout_errnum = PSTDOUT_ERR_UNINITIALIZED;
        return -1;
    }

    if (!pstdout_func)
    {
        pstdout_errnum = PSTDOUT_ERR_PARAMETERS;
        return -1;
    }

    if ((rc = pthread_mutex_lock(&pstdout_launch_mutex)))
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc));
        pstdout_errnum = PSTDOUT_ERR_INTERNAL;
        goto cleanup;
    }

    /* Special case */
    if (!hostnames)
    {
        if (_pstdout_state_init(&pstate, NULL) < 0)
            goto cleanup;
        pstate_init++;

        exit_code = pstdout_func(&pstate, NULL, arg);
        pstdout_errnum = PSTDOUT_ERR_SUCCESS;
        goto cleanup;
    }

    if (!(h = hostlist_create(hostnames)))
    {
        pstdout_errnum = PSTDOUT_ERR_OUTMEM;
        goto cleanup;
    }
    h_count = hostlist_count(h);

    /* Sanity check */
    if (h_count <= 0)
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "h_count = %d\n", h_count);
        pstdout_errnum = PSTDOUT_ERR_INTERNAL;
        goto cleanup;
    }

    /* Special case */
    if (h_count == 1)
    {
        if (_pstdout_state_init(&pstate, hostnames) < 0)
            goto cleanup;
        pstate_init++;

        exit_code = pstdout_func(&pstate, hostnames, arg);
        pstdout_errnum = PSTDOUT_ERR_SUCCESS;
        goto cleanup;
    }

    if ((sighandler_save = signal(SIGINT, _pstdout_sigint)) == SIG_ERR)
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "signal\n");
        pstdout_errnum = PSTDOUT_ERR_INTERNAL;
        goto cleanup;
    }
    sighandler_set++;

    if (!(hitr = hostlist_iterator_create(h)))
    {
        pstdout_errnum = PSTDOUT_ERR_OUTMEM;
        goto cleanup;
    }

    if (!(tdata = (struct pstdout_thread_data **)malloc(sizeof(struct pstdout_thread_data *) * h_count)))
    {
        pstdout_errnum = PSTDOUT_ERR_OUTMEM;
        goto cleanup;
    }
    memset(tdata, '\0', sizeof(struct pstdout_thread_data *) * h_count);

    i = 0;
    while ((host = hostlist_next(hitr)))
    {
        if (!(tdata[i] = (struct pstdout_thread_data *)malloc(sizeof(struct pstdout_thread_data))))
        {
            pstdout_errnum = PSTDOUT_ERR_OUTMEM;
            goto cleanup;
        }
        memset(tdata[i], '\0', sizeof(struct pstdout_thread_data));

        if (!(tdata[i]->hostname = strdup(host)))
        {
            pstdout_errnum = PSTDOUT_ERR_OUTMEM;
            goto cleanup;
        }
        tdata[i]->pstdout_func = pstdout_func;
        tdata[i]->arg = arg;

        if ((rc = pthread_attr_init(&(tdata[i]->attr))))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_attr_init: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }

        if ((rc = pthread_attr_setdetachstate(&(tdata[i]->attr), PTHREAD_CREATE_DETACHED)))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_attr_setdetachstate: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }

        free(host);
        i++;
    }
    host = NULL;

    hostlist_iterator_destroy(hitr);
    hitr = NULL;

    hostlist_destroy(h);
    h = NULL;

    /* Launch threads up to fanout */
    for (i = 0; i < h_count; i++)
    {
        if ((rc = pthread_mutex_lock(&pstdout_threadcount_mutex)))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }

        if (pstdout_threadcount == pstdout_fanout)
        {
            if ((rc = pthread_cond_wait(&pstdout_threadcount_cond, &pstdout_threadcount_mutex)))
            {
                if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                    fprintf(stderr, "pthread_cond_wait: %s\n", strerror(rc));
                pstdout_errnum = PSTDOUT_ERR_INTERNAL;
                goto cleanup;
            }
        }

        if ((rc = pthread_create(&(tdata[i]->tid),
                                 &(tdata[i]->attr),
                                 _pstdout_func_entry,
                                 (void *) tdata[i])))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_create: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }

        pstdout_threadcount++;

        if ((rc = pthread_mutex_unlock(&pstdout_threadcount_mutex)))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }
    }

    /* Wait for Threads to finish */

    if ((rc = pthread_mutex_lock(&pstdout_threadcount_mutex)))
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc));
        pstdout_errnum = PSTDOUT_ERR_INTERNAL;
        goto cleanup;
    }

    while (pstdout_threadcount > 0)
    {
        if ((rc = pthread_cond_wait(&pstdout_threadcount_cond, &pstdout_threadcount_mutex)))
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "pthread_cond_wait: %s\n", strerror(rc));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }
    }

    if (_pstdout_output_consolidated_finish() < 0)
        goto cleanup;

    /* Determine exit code */
    exit_code = 0;
    for (i = 0; i < h_count; i++)
    {
        if (tdata[i]->exit_code > exit_code)
            exit_code = tdata[i]->exit_code;
    }

cleanup:
    /* Cannot pass NULL for key, so just pass dummy key */
    list_delete_all(pstdout_consolidated_stdout, _pstdout_consolidated_data_delete_all, "");
    list_delete_all(pstdout_consolidated_stderr, _pstdout_consolidated_data_delete_all, "");
    if (pstate_init)
        _pstdout_state_cleanup(&pstate);
    if (tdata)
    {
        for (i = 0; i < h_count; i++)
        {
            if (tdata[i])
            {
                free(tdata[i]->hostname);
                pthread_attr_destroy(&(tdata[i]->attr));
                free(tdata[i]);
            }
        }
        free(tdata);
    }
    if (hitr)
        hostlist_iterator_destroy(hitr);
    if (h)
        hostlist_destroy(h);
    free(host);
    if ((rc = pthread_mutex_unlock(&pstdout_launch_mutex)))
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc));
        /* Don't change error code, just move on */
    }
    if (sighandler_set)
        signal(SIGINT, sighandler_save);
    return exit_code;
}
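A hedged usage sketch for pstdout_launch(): it assumes FreeIPMI's public pstdout.h interface (pstdout_init(), pstdout_printf(), pstdout_strerror(), the pstdout_errnum global) and illustrates the per-host callback contract; it is not a verbatim excerpt, and the host range is made up.

#include <stdio.h>
#include "pstdout.h"

/* Run once per host, in parallel up to the configured fanout; pstate
 * routes output so lines from different hosts do not interleave. */
static int
_per_host(pstdout_state_t pstate, const char *hostname, void *arg)
{
  pstdout_printf(pstate, "hello from %s\n", hostname);
  return 0;   /* pstdout_launch() reports the largest exit code seen */
}

int
main(void)
{
  if (pstdout_init() < 0)
    {
      fprintf(stderr, "pstdout_init: %s\n", pstdout_strerror(pstdout_errnum));
      return 1;
    }
  /* hostnames accept the same ranged syntax as hostlist_create() */
  return pstdout_launch("node[0-3]", _per_host, NULL);
}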
Example #29
File: node_conf.c Project: IFCA/slurm
/*
 * _build_single_nodeline_info - From the slurm.conf reader, build table,
 * 	and set values
 * RET 0 if no error, error code otherwise
 * Note: Operates on common variables
 *	default_node_record - default node configuration values
 */
static int _build_single_nodeline_info(slurm_conf_node_t *node_ptr,
				       struct config_record *config_ptr)
{
	int error_code = SLURM_SUCCESS;
	struct node_record *node_rec = NULL;
	hostlist_t address_list = NULL;
	hostlist_t alias_list = NULL;
	hostlist_t hostname_list = NULL;
	hostlist_t port_list = NULL;
	char *address = NULL;
	char *alias = NULL;
	char *hostname = NULL;
	char *port_str = NULL;
	int state_val = NODE_STATE_UNKNOWN;
	int address_count, alias_count, hostname_count, port_count;
	uint16_t port = 0;

	if (node_ptr->state != NULL) {
		state_val = state_str2int(node_ptr->state, node_ptr->nodenames);
		if (state_val == NO_VAL)
			goto cleanup;
	}

	if ((address_list = hostlist_create(node_ptr->addresses)) == NULL) {
		fatal("Unable to create NodeAddr list from %s",
		      node_ptr->addresses);
		error_code = errno;
		goto cleanup;
	}
	if ((alias_list = hostlist_create(node_ptr->nodenames)) == NULL) {
		fatal("Unable to create NodeName list from %s",
		      node_ptr->nodenames);
		error_code = errno;
		goto cleanup;
	}
	if ((hostname_list = hostlist_create(node_ptr->hostnames)) == NULL) {
		fatal("Unable to create NodeHostname list from %s",
		      node_ptr->hostnames);
		error_code = errno;
		goto cleanup;
	}
	if (node_ptr->port_str && node_ptr->port_str[0] &&
	    (node_ptr->port_str[0] != '[') &&
	    (strchr(node_ptr->port_str, '-') ||
	     strchr(node_ptr->port_str, ','))) {
		xstrfmtcat(port_str, "[%s]", node_ptr->port_str);
		port_list = hostlist_create(port_str);
		xfree(port_str);
	} else {
		port_list = hostlist_create(node_ptr->port_str);
	}
	if (port_list == NULL) {
		error("Unable to create Port list from %s",
		      node_ptr->port_str);
		error_code = errno;
		goto cleanup;
	}

	/* some sanity checks */
	address_count  = hostlist_count(address_list);
	alias_count    = hostlist_count(alias_list);
	hostname_count = hostlist_count(hostname_list);
	port_count     = hostlist_count(port_list);
#ifdef HAVE_FRONT_END
	if ((hostname_count != alias_count) && (hostname_count != 1)) {
		error("NodeHostname count must equal that of NodeName "
		      "records of there must be no more than one");
		goto cleanup;
	}
	if ((address_count != alias_count) && (address_count != 1)) {
		error("NodeAddr count must equal that of NodeName "
		      "records of there must be no more than one");
		goto cleanup;
	}
#else
#ifdef MULTIPLE_SLURMD
	if ((address_count != alias_count) && (address_count != 1)) {
		error("NodeAddr count must equal that of NodeName "
		      "records of there must be no more than one");
		goto cleanup;
	}
#else
	if (address_count < alias_count) {
		error("At least as many NodeAddr are required as NodeName");
		goto cleanup;
	}
	if (hostname_count < alias_count) {
		error("At least as many NodeHostname are required "
		      "as NodeName");
		goto cleanup;
	}
#endif	/* MULTIPLE_SLURMD */
#endif	/* HAVE_FRONT_END */
	if ((port_count != alias_count) && (port_count > 1)) {
		error("Port count must equal that of NodeName "
		      "records or there must be no more than one");
		goto cleanup;
	}

	/* now build the individual node structures */
	while ((alias = hostlist_shift(alias_list))) {
		if (address_count > 0) {
			address_count--;
			if (address)
				free(address);
			address = hostlist_shift(address_list);
		}
		if (hostname_count > 0) {
			hostname_count--;
			if (hostname)
				free(hostname);
			hostname = hostlist_shift(hostname_list);
		}
		if (port_count > 0) {
			int port_int;
			port_count--;
			if (port_str)
				free(port_str);
			port_str = hostlist_shift(port_list);
			port_int = atoi(port_str);
			if ((port_int <= 0) || (port_int > 0xffff))
				fatal("Invalid Port %s", node_ptr->port_str);
			port = port_int;
		}
		/* find_node_record locks this to get the
		 * alias so we need to unlock */
		node_rec = find_node_record(alias);

		if (node_rec == NULL) {
			node_rec = create_node_record(config_ptr, alias);
			if ((state_val != NO_VAL) &&
			    (state_val != NODE_STATE_UNKNOWN))
				node_rec->node_state = state_val;
			node_rec->last_response = (time_t) 0;
			node_rec->comm_name = xstrdup(address);
			node_rec->node_hostname = xstrdup(hostname);
			node_rec->port      = port;
			node_rec->weight    = node_ptr->weight;
			node_rec->features  = xstrdup(node_ptr->feature);
			node_rec->reason    = xstrdup(node_ptr->reason);
		} else {
			/* FIXME - maybe should be fatal? */
			error("Reconfiguration for node %s, ignoring!", alias);
		}
		free(alias);
	}

	/* free allocated storage */
cleanup:
	if (address)
		free(address);
	if (hostname)
		free(hostname);
	if (port_str)
		free(port_str);
	if (address_list)
		hostlist_destroy(address_list);
	if (alias_list)
		hostlist_destroy(alias_list);
	if (hostname_list)
		hostlist_destroy(hostname_list);
	if (port_list)
		hostlist_destroy(port_list);
	return error_code;
}
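The while loop above drains the NodeName/NodeAddr/NodeHostname/Port lists in lock-step with hostlist_shift(). A self-contained sketch of that pattern: hostlist_shift() removes the head of the list and returns it as a malloc'd string that the caller must free(), exactly as the cleanup code above assumes. The ranged expression "tux[1-3]" is illustrative.

#include <stdio.h>
#include <stdlib.h>
#include "hostlist.h"

int main(void)
{
	hostlist_t hl = hostlist_create("tux[1-3]");
	char *host;

	if (!hl)
		return 1;
	/* hostlist_shift() removes and returns the head of the list */
	while ((host = hostlist_shift(hl))) {
		printf("%s\n", host);	/* prints tux1, tux2, tux3 */
		free(host);		/* caller owns the returned string */
	}
	hostlist_destroy(hl);
	return 0;
}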
Example #30
File: pmixp_coll.c Project: artpol84/slurm
static void _progress_fan_in(pmixp_coll_t *coll)
{
	pmixp_srv_cmd_t type;
	const char *addr = pmixp_info_srv_addr();
	char *hostlist = NULL;
	int rc, is_p2p = 0;
	Buf root_buf;

	PMIXP_DEBUG("%s:%d: start, local=%d, child_cntr=%d",
			pmixp_info_namespace(), pmixp_info_nodeid(),
			coll->contrib_local, coll->contrib_cntr);

	/* lock the collective */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_FAN_IN != coll->state) {
		/* In case of race condition between libpmix and
		 * slurm threads progress_fan_in can be called
		 * after we moved to the next step. */
		goto unlock;
	}

	if (!coll->contrib_local || coll->contrib_cntr != coll->children_cnt) {
		/* Not yet ready to go to the next step */
		goto unlock;
	}

	/* The root of the collective will have parent_host == NULL */
	if (NULL != coll->parent_host) {
		hostlist = xstrdup(coll->parent_host);
		type = PMIXP_MSG_FAN_IN;
		PMIXP_DEBUG("%s:%d: switch to PMIXP_COLL_FAN_OUT state",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		is_p2p = 1;
	} else {
		if (0 < hostlist_count(coll->all_children)) {
			hostlist = hostlist_ranged_string_xmalloc(
					coll->all_children);
			type = PMIXP_MSG_FAN_OUT;
			pmixp_debug_hang(0);
		}
		rc = _copy_payload(coll->buf, coll->serv_offs, &root_buf);
		xassert(0 == rc);
		PMIXP_DEBUG("%s:%d: finish with this collective (I am the root)",
			    pmixp_info_namespace(), pmixp_info_nodeid());
	}

	PMIXP_DEBUG("%s:%d: send data to %s", pmixp_info_namespace(),
			pmixp_info_nodeid(), hostlist);

	/* Check for the singleton case */
	if (NULL != hostlist) {
		if ((0 == coll->seq) && (NULL != coll->parent_host)) {
			/* This is the first message sent to the parent.
			 * There might be a race condition where parent
			 * is not ready to receive the messages.
			 * Use zero-size message to check parent status first
			 * and then send the full message.
			 */
			pmixp_server_health_chk(hostlist, addr);
		}
		rc = pmixp_server_send(hostlist, type, coll->seq, addr,
				get_buf_data(coll->buf),
				get_buf_offset(coll->buf), is_p2p);

		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR(
					"Cannot send data (size = %lu) to hostlist:\n%s",
					(uint64_t) get_buf_offset(coll->buf),
					hostlist);
			/* Return an error indication to PMIx. Nodes that haven't
			 * received data will exit by a timeout.
			 * FIXME: do we need to do something with successfully
			 * finished nodes?
			 */
			goto unlock;
		}
	}

	/* transit to the next state */
	_fan_in_finished(coll);

	/* If we are the root - push the data to PMIx here.
	 * Originally there was a homogeneous solution: the root nodename was
	 * included in the hostlist. However, that may lead to undesired side
	 * effects: we would be blocked here sending data and unable to receive
	 * (receiving is triggered in this thread only after we leave this
	 * callback), so we would have to rely on buffering on the SLURM side.
	 * Better not to do so. */
	if (NULL == coll->parent_host) {
		/* if I am the root - pass the data to PMIx and reset collective here */
		/* copy payload excluding reserved server header */
		_progres_fan_out(coll, root_buf);
	}

unlock:
	if (NULL != hostlist) {
		xfree(hostlist);
	}

	/* unlock the collective */
	slurm_mutex_unlock(&coll->lock);
}
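The fan-in/fan-out phases above presuppose that every node knows its parent and children in the collective tree. The pmixp code derives that topology elsewhere, so the following is only a hedged sketch of the standard k-ary tree arithmetic such code tends to use; all names here are illustrative.

/* For a k-ary tree rooted at rank 0:
 *   parent(r)   = (r - 1) / k            (rank 0 has no parent)
 *   children(r) = r*k + 1 ... r*k + k    (clipped to the node count) */
static int _tree_parent(int rank, int k)
{
	return rank ? (rank - 1) / k : -1;
}

static int _tree_children(int rank, int k, int nnodes,
			  int *children, int max_children)
{
	int i, cnt = 0;

	for (i = 1; i <= k && cnt < max_children; i++) {
		int child = rank * k + i;
		if (child < nnodes)
			children[cnt++] = child;
	}
	return cnt;	/* number of direct children of this rank */
}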