예제 #1
0
파일: pmixp_utils.c 프로젝트: A1ve5/slurm
/*
 * Forward `len` bytes of `data` to `address` on the nodes named in
 * `nodelist` through slurm_forward_data(), retrying on failure.
 *
 * nodelist    - target node list; a private copy is handed to
 *               slurm_forward_data() so the caller's string is untouched
 * address     - destination address passed through to slurm_forward_data()
 * data, len   - payload and its length
 * start_delay - pause before the first retry, in milliseconds; the pause
 *               is doubled after every failed attempt
 * retry_cnt   - the loop stops once this many attempts have failed
 * silent      - when non-zero, failed attempts are not logged
 *
 * Returns the result of the last slurm_forward_data() call
 * (SLURM_SUCCESS as soon as one attempt succeeds).
 */
int pmixp_stepd_send(char *nodelist, const char *address, char *data,
		     uint32_t len, unsigned int start_delay,
		     unsigned int retry_cnt, int silent)
{
	char *nodes = xstrdup(nodelist);
	unsigned int wait_ms = start_delay; /* in milliseconds */
	int attempt = 0;
	int rc;

	for (;;) {
		/* report the previous failure before trying again */
		if (attempt >= 1 && !silent) {
			PMIXP_ERROR("send failed, rc=%d, try #%d", rc, attempt);
		}

		rc = slurm_forward_data(&nodes, (char *)address, len, data);
		if (rc == SLURM_SUCCESS)
			break;

		/* give up once the failed-attempt budget is used up */
		if (++attempt >= retry_cnt)
			break;

		/* back off: split the millisecond delay into sec + nsec */
		struct timespec nap;
		nap.tv_sec = wait_ms / 1000;
		nap.tv_nsec = (wait_ms % 1000) * 1000000;
		nanosleep(&nap, NULL);
		wait_ms *= 2;
	}

	xfree(nodes);
	return rc;
}
예제 #2
0
파일: kvs.c 프로젝트: SchedMD/slurm
/*
 * Send the accumulated temp kvs buffer (temp_kvs_buf / temp_kvs_cnt)
 * one level up the tree, retrying with a doubling delay on failure.
 *
 * When a target node list is available (srun, or a stepd that has a
 * parent node) the buffer goes through slurm_forward_data(); otherwise
 * (first-level stepd) it is sent with tree_msg_to_srun().
 *
 * The temp kvs is re-initialised before returning, whether or not the
 * send succeeded. Returns the result of the last send attempt.
 */
extern int
temp_kvs_send(void)
{
	char *target = NULL;
	unsigned int wait_sec = 1;
	int attempts = 0;
	int rc = SLURM_ERROR;

	if (!in_stepd()) {
		/* srun */
		target = xstrdup(job_info.step_nodelist);
	} else if (tree_info.parent_node) {
		target = xstrdup(tree_info.parent_node);
	}

	/* cmd included in temp_kvs_buf */
	kvs_seq++; /* expecting new kvs after now */

	for (;;) {
		/* only the first failure is reported */
		if (attempts == 1) {
			verbose("failed to send temp kvs, rc=%d, retrying", rc);
		}

		if (target) {
			/* srun or non-first-level stepds */
			rc = slurm_forward_data(&target, tree_sock_addr,
						temp_kvs_cnt, temp_kvs_buf);
		} else {
			/* first level stepds */
			rc = tree_msg_to_srun(temp_kvs_cnt, temp_kvs_buf);
		}

		/* the retry counter is only bumped when the send failed */
		if (rc == SLURM_SUCCESS || ++attempts >= MAX_RETRIES)
			break;

		/* wait, in case parent stepd / srun not ready */
		sleep(wait_sec);
		wait_sec *= 2;
	}

	temp_kvs_init();	/* clear old temp kvs */
	xfree(target);

	return rc;
}
예제 #3
0
파일: ring.c 프로젝트: A1ve5/slurm
/*
 * Send the message defined by buf and size to the stepd of the given rank.
 *
 * The rank is mapped to a host name through pmix_stepd_hostlist and the
 * message is forwarded with slurm_forward_data(). Failed sends are retried
 * with exponential backoff (1s, 2s, 4s, ...). After MAX_RETRIES failed
 * attempts the job step is killed and the function stops retrying.
 *
 * Returns SLURM_SUCCESS if the message was sent, otherwise the error code
 * of the last failed slurm_forward_data() call.
 */
static int pmix_stepd_send(const char* buf, uint32_t size, int rank)
{
	int rc = SLURM_SUCCESS;

	/* map rank to host name */
	/* NOTE(review): the hostlist_nth() result is not checked here;
	 * presumably rank is always within the hostlist -- confirm */
	char* host = hostlist_nth(pmix_stepd_hostlist, rank); /* strdup-ed */

	/* delay to sleep between retries in seconds,
	 * if there are multiple retries, we'll grow this delay
	 * using exponential backoff, doubling it each time */
	unsigned int delay = 1;

	/* we'll try multiple times to send message to stepd,
	 * we retry in case stepd is just slow to get started */
	int retries = 0;
	while (1) {
		/* attempt to send message */
		rc = slurm_forward_data(&host, tree_sock_addr, size, buf);
		if (rc == SLURM_SUCCESS) {
			/* message sent successfully, we're done */
			break;
		}

		/* check whether we've exceeded our retry count */
		retries++;
		if (retries >= MAX_RETRIES) {
			/* cancel the step to avoid tasks hang */
			slurm_kill_job_step(job_info.jobid, job_info.stepid,
					    SIGKILL);
			/* stop retrying: without this the loop would spin
			 * forever with an ever-growing sleep, never free
			 * the host name and never report the failure */
			break;
		}

		/* didn't succeed, but we'll retry again,
		 * sleep for a bit first */
		sleep(delay);
		delay *= 2;
	}

	/* free host name */
	free(host); /* strdup-ed */

	return rc;
}