Ejemplo n.º 1
0
/*
 * check_header_version - decide whether the protocol version carried in a
 *	received message header is acceptable.
 *
 * The "expected" version is working_cluster_rec->rpc_version when
 * working_cluster_rec is set, otherwise SLURM_PROTOCOL_VERSION.
 *
 *  - If slurmdbd_conf is set, the header version must be one of
 *    SLURM_PROTOCOL_VERSION, SLURM_ONE_BACK_PROTOCOL_VERSION or
 *    SLURM_MIN_PROTOCOL_VERSION.
 *  - Otherwise a header matching the expected version is accepted as is.
 *  - Otherwise job step creation/launch RPCs are refused whenever
 *    working_cluster_rec is set; every other case falls back to the
 *    three-version check above.
 *
 * IN header - the message header received
 * RET SLURM_PROTOCOL_SUCCESS if accepted; a rejection goes through
 *	slurm_seterrno_ret(SLURM_PROTOCOL_VERSION_ERROR), which is defined
 *	elsewhere (presumably sets errno and returns an error code)
 */
int check_header_version(header_t * header)
{
	uint16_t expected_version = SLURM_PROTOCOL_VERSION;
	int version_known;
	int is_step_rpc;

	if (working_cluster_rec)
		expected_version = working_cluster_rec->rpc_version;

	/* True when the sender speaks one of the three listed versions */
	version_known =
		(header->version == SLURM_PROTOCOL_VERSION) ||
		(header->version == SLURM_ONE_BACK_PROTOCOL_VERSION) ||
		(header->version == SLURM_MIN_PROTOCOL_VERSION);

	if (slurmdbd_conf) {
		if (!version_known) {
			debug("unsupported RPC version %hu msg type %s(%u)",
			      header->version,
			      rpc_num2string(header->msg_type),
			      header->msg_type);
			slurm_seterrno_ret(SLURM_PROTOCOL_VERSION_ERROR);
		}
		return SLURM_PROTOCOL_SUCCESS;
	}

	if (header->version == expected_version)
		return SLURM_PROTOCOL_SUCCESS;

	is_step_rpc =
		(header->msg_type == REQUEST_LAUNCH_TASKS) ||
		(header->msg_type == REQUEST_RUN_JOB_STEP) ||
		(header->msg_type == RESPONSE_LAUNCH_TASKS) ||
		(header->msg_type == RESPONSE_RUN_JOB_STEP);

	if (is_step_rpc && working_cluster_rec) {
		/* Job step creation/launch is disabled between major
		 * releases. Other RPCs should all be supported. */
		debug("unsupported RPC type %hu", header->msg_type);
		slurm_seterrno_ret(SLURM_PROTOCOL_VERSION_ERROR);
	} else if (!version_known) {
		debug("Unsupported RPC version %hu "
		      "msg type %s(%u)", header->version,
		      rpc_num2string(header->msg_type),
		      header->msg_type);
		slurm_seterrno_ret(SLURM_PROTOCOL_VERSION_ERROR);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Ejemplo n.º 2
0
static int _print_stats(void)
{
    int i;

    if (!buf) {
        printf("No data available. Probably slurmctld is not working\n");
        return -1;
    }

    printf("*******************************************************\n");
    printf("sdiag output at %s", slurm_ctime(&buf->req_time));
    printf("Data since      %s", slurm_ctime(&buf->req_time_start));
    printf("*******************************************************\n");

    printf("Server thread count: %d\n", buf->server_thread_count);
    printf("Agent queue size:    %d\n\n", buf->agent_queue_size);
    printf("Jobs submitted: %d\n", buf->jobs_submitted);
    printf("Jobs started:   %d\n", buf->jobs_started);
    printf("Jobs completed: %d\n", buf->jobs_completed);
    printf("Jobs canceled:  %d\n", buf->jobs_canceled);
    printf("Jobs failed:    %d\n", buf->jobs_failed);
    printf("\nMain schedule statistics (microseconds):\n");
    printf("\tLast cycle:   %u\n", buf->schedule_cycle_last);
    printf("\tMax cycle:    %u\n", buf->schedule_cycle_max);
    printf("\tTotal cycles: %u\n", buf->schedule_cycle_counter);
    if (buf->schedule_cycle_counter > 0) {
        printf("\tMean cycle:   %u\n",
               buf->schedule_cycle_sum / buf->schedule_cycle_counter);
        printf("\tMean depth cycle:  %u\n",
               buf->schedule_cycle_depth / buf->schedule_cycle_counter);
    }
    if ((buf->req_time - buf->req_time_start) > 60) {
        printf("\tCycles per minute: %u\n",
               (uint32_t) (buf->schedule_cycle_counter /
                           ((buf->req_time - buf->req_time_start) / 60)));
    }
    printf("\tLast queue length: %u\n", buf->schedule_queue_len);

    if (buf->bf_active) {
        printf("\nBackfilling stats (WARNING: data obtained"
               " in the middle of backfilling execution.)\n");
    } else
        printf("\nBackfilling stats\n");

    printf("\tTotal backfilled jobs (since last slurm start): %u\n",
           buf->bf_backfilled_jobs);
    printf("\tTotal backfilled jobs (since last stats cycle start): %u\n",
           buf->bf_last_backfilled_jobs);
    printf("\tTotal cycles: %u\n", buf->bf_cycle_counter);
    printf("\tLast cycle when: %s", slurm_ctime(&buf->bf_when_last_cycle));
    printf("\tLast cycle: %u\n", buf->bf_cycle_last);
    printf("\tMax cycle:  %u\n", buf->bf_cycle_max);
    if (buf->bf_cycle_counter > 0) {
        printf("\tMean cycle: %"PRIu64"\n",
               buf->bf_cycle_sum / buf->bf_cycle_counter);
    }
    printf("\tLast depth cycle: %u\n", buf->bf_last_depth);
    printf("\tLast depth cycle (try sched): %u\n", buf->bf_last_depth_try);
    if (buf->bf_cycle_counter > 0) {
        printf("\tDepth Mean: %u\n",
               buf->bf_depth_sum / buf->bf_cycle_counter);
        printf("\tDepth Mean (try depth): %u\n",
               buf->bf_depth_try_sum / buf->bf_cycle_counter);
    }
    printf("\tLast queue length: %u\n", buf->bf_queue_len);
    if (buf->bf_cycle_counter > 0) {
        printf("\tQueue length mean: %u\n",
               buf->bf_queue_len_sum / buf->bf_cycle_counter);
    }

    printf("\nRemote Procedure Call statistics by message type\n");
    for (i = 0; i < buf->rpc_type_size; i++) {
        printf("\t%-40s(%5u) count:%-6u "
               "ave_time:%-6u total_time:%"PRIu64"\n",
               rpc_num2string(buf->rpc_type_id[i]),
               buf->rpc_type_id[i], buf->rpc_type_cnt[i],
               rpc_type_ave_time[i], buf->rpc_type_time[i]);
    }

    printf("\nRemote Procedure Call statistics by user\n");
    for (i = 0; i < buf->rpc_user_size; i++) {
        printf("\t%-16s(%8u) count:%-6u "
               "ave_time:%-6u total_time:%"PRIu64"\n",
               uid_to_string_cached((uid_t)buf->rpc_user_id[i]),
               buf->rpc_user_id[i], buf->rpc_user_cnt[i],
               rpc_user_ave_time[i], buf->rpc_user_time[i]);
    }

    return 0;
}
Ejemplo n.º 3
0
/*
 * send_recv_slurmdbd_msg - send one RPC over the SlurmDBD connection and
 *	unpack whatever reply comes back. The RPC is not queued on error.
 * IN rpc_version - version passed to pack_slurmdbd_msg() and
 *	unpack_slurmdbd_msg()
 * IN req - request message to send
 * OUT resp - reply, filled in by unpack_slurmdbd_msg(); the caller is
 *	responsible for freeing it
 * RET SLURM_SUCCESS, SLURM_ERROR (no connection, pack failure, no reply),
 *	the code returned by the send or unpack step, or - when the reply is
 *	a DBD_ID_RC message that unpacked cleanly - its return_code field
 */
extern int send_recv_slurmdbd_msg(uint16_t rpc_version,
				  slurmdbd_msg_t *req,
				  slurmdbd_msg_t *resp)
{
	Buf msg_buf;
	int rc = SLURM_SUCCESS;

	xassert(req);
	xassert(resp);

	/* Raise halt_agent while waiting for the lock so this request gets
	 * sent ahead of whatever the agent might send at any time; clear
	 * the flag again once the mutex is held. */
	halt_agent = 1;
	slurm_mutex_lock(&slurmdbd_lock);
	halt_agent = 0;

	if (!slurmdbd_conn || (slurmdbd_conn->fd < 0)) {
		/* slurm_open_slurmdbd_conn() was never run, or the
		 * connection to Slurm DBD has been closed: (re)open it.
		 * DBD_GET_CONFIG requests pass 0, everything else 1. */
		_open_slurmdbd_conn((req->msg_type == DBD_GET_CONFIG) ? 0 : 1);
		if (!slurmdbd_conn || (slurmdbd_conn->fd < 0)) {
			rc = SLURM_ERROR;
			goto unlock;
		}
	}

	msg_buf = pack_slurmdbd_msg(req, rpc_version);
	if (!msg_buf) {
		rc = SLURM_ERROR;
		goto unlock;
	}

	rc = slurm_persist_send_msg(slurmdbd_conn, msg_buf);
	free_buf(msg_buf);
	if (rc != SLURM_SUCCESS) {
		error("slurmdbd: Sending message type %s: %d: %m",
		      rpc_num2string(req->msg_type), rc);
		goto unlock;
	}

	msg_buf = slurm_persist_recv_msg(slurmdbd_conn);
	if (!msg_buf) {
		error("slurmdbd: Getting response to message type %u",
		      req->msg_type);
		rc = SLURM_ERROR;
		goto unlock;
	}

	rc = unpack_slurmdbd_msg(resp, rpc_version, msg_buf);
	if (rc == SLURM_SUCCESS) {
		/* A DBD_ID_RC reply (e.g. to a start job message) carries
		 * its own return code; report that one to the caller. */
		if (resp->msg_type == DBD_ID_RC)
			rc = ((dbd_id_rc_msg_t *)resp->data)->return_code;
	}
	free_buf(msg_buf);

unlock:
	slurm_cond_signal(&slurmdbd_cond);
	slurm_mutex_unlock(&slurmdbd_lock);

	return rc;
}