Пример #1
0
static void psmx_set_epaddr_context(struct psmx_fid_domain *domain,
				    psm_epid_t epid, psm_epaddr_t epaddr)
{
	struct psmx_epaddr_context *context;

	context = (void *)psm_epaddr_getctxt(epaddr);
	if (context) {
		if (context->domain != domain || context->epid != epid) {
			FI_WARN(&psmx_prov, FI_LOG_AV,
				"domain or epid doesn't match\n");
			context = NULL;
		}
	}

	if (context)
		return;

	context = malloc(sizeof *context);
	if (!context) {
		FI_WARN(&psmx_prov, FI_LOG_AV,
			"cannot allocate context\n");
		return;
	}

	context->domain = domain;
	context->epid = epid;
	psm_epaddr_setctxt(epaddr, context);
}
Пример #2
0
int psmx_epid_to_epaddr(struct psmx_fid_domain *domain,
			psm_epid_t epid, psm_epaddr_t *epaddr)
{
        int err;
        psm_error_t errors;
	psm_epconn_t epconn;
	struct psmx_epaddr_context *context;

	err = psm_ep_epid_lookup(epid, &epconn);
	if (err == PSM_OK) {
		context = psm_epaddr_getctxt(epconn.addr);
		if (context && context->epid  == epid) {
			*epaddr = epconn.addr;
			return 0;
		}
	}

        err = psm_ep_connect(domain->psm_ep, 1, &epid, NULL, &errors, epaddr, 30*1e9);
        if (err != PSM_OK)
                return psmx_errno(err);

	psmx_set_epaddr_context(domain,epid,*epaddr);

        return 0;
}
Пример #3
0
static void psmx_set_epaddr_context(struct psmx_fid_domain *domain,
				    psm_epid_t epid, psm_epaddr_t epaddr)
{
	struct psmx_epaddr_context *context;

	context = (void *)psm_epaddr_getctxt(epaddr);
	if (context) {
		if (context->domain != domain || context->epid != epid) {
			fprintf(stderr, "%s: domain or epid doesn't match\n", __func__);
			context = NULL;
		}
	}

	if (context)
		return;

	context = malloc(sizeof *context);
	if (!context) {
		fprintf(stderr, "%s: cannot allocate context\n", __func__);
		return;
	}

	context->domain = domain;
	context->epid = epid;
	psm_epaddr_setctxt(epaddr, context);
}
Пример #4
0
static int psmx_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr,
			  size_t *addrlen)
{
	struct psmx_fid_av *av_priv;
	struct psmx_epaddr_context *context;
	psm_epid_t epid;
	int idx;

	if (!addr || !addrlen)
		return -FI_EINVAL;

	av_priv = container_of(av, struct psmx_fid_av, av);

	if (av_priv->type == FI_AV_TABLE) {
		idx = (int)(int64_t)fi_addr;
		if (idx >= av_priv->last)
			return -FI_EINVAL;

		epid = av_priv->psm_epids[idx];
	}
	else {
		context = psm_epaddr_getctxt((void *)fi_addr);
		epid = context->epid;
	}

	if (*addrlen >= sizeof(epid))
		*(psm_epid_t *)addr = epid;
	else
		memcpy(addr, &epid, *addrlen);
	*addrlen = sizeof(epid);

	return 0;
}
Пример #5
0
ssize_t _psmx_writeto(struct fid_ep *ep, const void *buf, size_t len,
		      void *desc, fi_addr_t dest_addr,
		      uint64_t addr, uint64_t key, void *context,
		      uint64_t flags, uint64_t data)
{
	struct psmx_fid_ep *ep_priv;
	struct psmx_fid_av *av;
	struct psmx_epaddr_context *epaddr_context;
	struct psmx_am_request *req;
	psm_amarg_t args[8];
	int nargs;
	int am_flags = PSM_AM_FLAG_ASYNC;
	int err;
	int chunk_size;
	psm_mq_req_t psm_req;
	uint64_t psm_tag;
	size_t idx;

	if (flags & FI_TRIGGER) {
		struct psmx_trigger *trigger;
		struct fi_triggered_context *ctxt = context;

		trigger = calloc(1, sizeof(*trigger));
		if (!trigger)
			return -ENOMEM;

		trigger->op = PSMX_TRIGGERED_WRITE;
		trigger->cntr = container_of(ctxt->threshold.cntr,
					     struct psmx_fid_cntr, cntr);
		trigger->threshold = ctxt->threshold.threshold;
		trigger->write.ep = ep;
		trigger->write.buf = buf;
		trigger->write.len = len;
		trigger->write.desc = desc;
		trigger->write.dest_addr = dest_addr;
		trigger->write.addr = addr;
		trigger->write.key = key;
		trigger->write.context = context;
		trigger->write.flags = flags & ~FI_TRIGGER;
		trigger->write.data = data;

		psmx_cntr_add_trigger(trigger->cntr, trigger);
		return 0;
	}

	ep_priv = container_of(ep, struct psmx_fid_ep, ep);
	assert(ep_priv->domain);

	if (!buf)
		return -EINVAL;

	av = ep_priv->av;
	if (av && av->type == FI_AV_TABLE) {
		idx = dest_addr;
		if (idx >= av->last)
			return -EINVAL;

		dest_addr = (fi_addr_t) av->psm_epaddrs[idx];
	}
	else if (!dest_addr) {
		return -EINVAL;
	}

	epaddr_context = psm_epaddr_getctxt((void *)dest_addr);
	if (epaddr_context->epid == ep_priv->domain->psm_epid)
		return psmx_rma_self(PSMX_AM_REQ_WRITE,
				     ep_priv, (void *)buf, len, desc,
				     addr, key, context, flags, data);

	if (flags & FI_INJECT) {
		req = malloc(sizeof(*req) + len);
		if (!req)
			return -ENOMEM;

		memset((void *)req, 0, sizeof(*req));
		memcpy((void *)req + sizeof(*req), (void *)buf, len);
		buf = (void *)req + sizeof(*req);

		PSMX_CTXT_TYPE(&req->fi_context) = PSMX_INJECT_WRITE_CONTEXT;
		req->no_event = 1;
	}
	else {
		req = calloc(1, sizeof(*req));
		if (!req)
			return -ENOMEM;

		if (ep_priv->send_cq_event_flag && !(flags & FI_EVENT)) {
			PSMX_CTXT_TYPE(&req->fi_context) = PSMX_NOCOMP_WRITE_CONTEXT;
			req->no_event = 1;
		}
		else {
			PSMX_CTXT_TYPE(&req->fi_context) = PSMX_WRITE_CONTEXT;
		}
	}

	req->op = PSMX_AM_REQ_WRITE;
	req->write.buf = (void *)buf;
	req->write.len = len;
	req->write.addr = addr;	/* needed? */
	req->write.key = key; 	/* needed? */
	req->write.context = context;
	req->ep = ep_priv;
	PSMX_CTXT_USER(&req->fi_context) = context;

	chunk_size = MIN(PSMX_AM_CHUNK_SIZE, psmx_am_param.max_request_short);

	if (psmx_env.tagged_rma && len > chunk_size) {
		psm_tag = PSMX_RMA_BIT | ep_priv->domain->psm_epid;
		args[0].u32w0 = PSMX_AM_REQ_WRITE_LONG;
		args[0].u32w1 = len;
		args[1].u64 = (uint64_t)req;
		args[2].u64 = addr;
		args[3].u64 = key;
		args[4].u64 = psm_tag;
		nargs = 5;
		if (flags & FI_REMOTE_CQ_DATA) {
			args[5].u64 = data;
			args[0].u32w0 |= PSMX_AM_DATA;
			nargs++;
		}
		err = psm_am_request_short((psm_epaddr_t) dest_addr,
					PSMX_AM_RMA_HANDLER, args, nargs,
					NULL, 0, am_flags | PSM_AM_FLAG_NOREPLY,
					NULL, NULL);

		psm_mq_isend(ep_priv->domain->psm_mq, (psm_epaddr_t) dest_addr,
				0, psm_tag, buf, len, (void *)&req->fi_context, &psm_req);

		return 0;
	}

	nargs = 4;
	while (len > chunk_size) {
		args[0].u32w0 = PSMX_AM_REQ_WRITE;
		args[0].u32w1 = chunk_size;
		args[1].u64 = (uint64_t)(uintptr_t)req;
		args[2].u64 = addr;
		args[3].u64 = key;
		err = psm_am_request_short((psm_epaddr_t) dest_addr,
					PSMX_AM_RMA_HANDLER, args, nargs,
					(void *)buf, chunk_size,
					am_flags | PSM_AM_FLAG_NOREPLY, NULL, NULL);
		buf += chunk_size;
		addr += chunk_size;
		len -= chunk_size;
	}

	args[0].u32w0 = PSMX_AM_REQ_WRITE | PSMX_AM_EOM;
	args[0].u32w1 = len;
	args[1].u64 = (uint64_t)(uintptr_t)req;
	args[2].u64 = addr;
	args[3].u64 = key;
	if (flags & FI_REMOTE_CQ_DATA) {
		args[4].u64 = data;
		args[0].u32w0 |= PSMX_AM_DATA;
		nargs++;
	}
	err = psm_am_request_short((psm_epaddr_t) dest_addr,
				PSMX_AM_RMA_HANDLER, args, nargs,
				(void *)buf, len, am_flags, NULL, NULL);

	return 0;
}
Пример #6
0
ssize_t _psmx_readfrom(struct fid_ep *ep, void *buf, size_t len,
		       void *desc, fi_addr_t src_addr,
		       uint64_t addr, uint64_t key, void *context,
		       uint64_t flags)
{
	struct psmx_fid_ep *ep_priv;
	struct psmx_fid_av *av;
	struct psmx_epaddr_context *epaddr_context;
	struct psmx_am_request *req;
	psm_amarg_t args[8];
	int err;
	int chunk_size;
	size_t offset = 0;
	uint64_t psm_tag;
	psm_mq_req_t psm_req;
	size_t idx;

	if (flags & FI_TRIGGER) {
		struct psmx_trigger *trigger;
		struct fi_triggered_context *ctxt = context;

		trigger = calloc(1, sizeof(*trigger));
		if (!trigger)
			return -ENOMEM;

		trigger->op = PSMX_TRIGGERED_READ;
		trigger->cntr = container_of(ctxt->threshold.cntr,
					     struct psmx_fid_cntr, cntr);
		trigger->threshold = ctxt->threshold.threshold;
		trigger->read.ep = ep;
		trigger->read.buf = buf;
		trigger->read.len = len;
		trigger->read.desc = desc;
		trigger->read.src_addr = src_addr;
		trigger->read.addr = addr;
		trigger->read.key = key;
		trigger->read.context = context;
		trigger->read.flags = flags & ~FI_TRIGGER;

		psmx_cntr_add_trigger(trigger->cntr, trigger);
		return 0;
	}

	ep_priv = container_of(ep, struct psmx_fid_ep, ep);
	assert(ep_priv->domain);

	if (!buf)
		return -EINVAL;

	av = ep_priv->av;
	if (av && av->type == FI_AV_TABLE) {
		idx = src_addr;
		if (idx >= av->last)
			return -EINVAL;

		src_addr = (fi_addr_t) av->psm_epaddrs[idx];
	}
	else if (!src_addr) {
		return -EINVAL;
	}

	epaddr_context = psm_epaddr_getctxt((void *)src_addr);
	if (epaddr_context->epid == ep_priv->domain->psm_epid)
		return psmx_rma_self(PSMX_AM_REQ_READ,
				     ep_priv, buf, len, desc,
				     addr, key, context, flags, 0);

	req = calloc(1, sizeof(*req));
	if (!req)
		return -ENOMEM;

	req->op = PSMX_AM_REQ_READ;
	req->read.buf = buf;
	req->read.len = len;
	req->read.addr = addr;	/* needed? */
	req->read.key = key; 	/* needed? */
	req->read.context = context;
	req->ep = ep_priv;
	PSMX_CTXT_TYPE(&req->fi_context) = PSMX_READ_CONTEXT;
	PSMX_CTXT_USER(&req->fi_context) = context;

	if (ep_priv->send_cq_event_flag && !(flags & FI_EVENT)) {
		PSMX_CTXT_TYPE(&req->fi_context) = PSMX_NOCOMP_READ_CONTEXT;
		req->no_event = 1;
	}

	chunk_size = MIN(PSMX_AM_CHUNK_SIZE, psmx_am_param.max_reply_short);

	if (psmx_env.tagged_rma && len > chunk_size) {
		psm_tag = PSMX_RMA_BIT | ep_priv->domain->psm_epid;
		err = psm_mq_irecv(ep_priv->domain->psm_mq, psm_tag, -1ULL,
			0, buf, len, (void *)&req->fi_context, &psm_req);

		args[0].u32w0 = PSMX_AM_REQ_READ_LONG;
		args[0].u32w1 = len;
		args[1].u64 = (uint64_t)req;
		args[2].u64 = addr;
		args[3].u64 = key;
		args[4].u64 = psm_tag;
		err = psm_am_request_short((psm_epaddr_t) src_addr,
					PSMX_AM_RMA_HANDLER, args, 5, NULL, 0,
					PSM_AM_FLAG_NOREPLY, NULL, NULL);

		return 0;
	}

	args[0].u32w0 = PSMX_AM_REQ_READ;
	args[1].u64 = (uint64_t)(uintptr_t)req;
	args[3].u64 = key;
	while (len > chunk_size) {
		args[0].u32w1 = chunk_size;
		args[2].u64 = addr;
		args[4].u64 = offset;
		err = psm_am_request_short((psm_epaddr_t) src_addr,
					PSMX_AM_RMA_HANDLER, args, 5, NULL, 0,
					0, NULL, NULL);
		addr += chunk_size;
		len -= chunk_size;
		offset += chunk_size;
	}

	args[0].u32w0 = PSMX_AM_REQ_READ | PSMX_AM_EOM;
	args[0].u32w1 = len;
	args[2].u64 = addr;
	args[4].u64 = offset;
	err = psm_am_request_short((psm_epaddr_t) src_addr,
				PSMX_AM_RMA_HANDLER, args, 5, NULL, 0,
				0, NULL, NULL);

	return 0;
}
Пример #7
0
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
			  fi_addr_t *fi_addr, uint64_t flags, void *context)
{
	struct psmx_fid_av *av_priv;
	psm_error_t *errors;
	int error_count = 0;
	int *mask;
	int i, j;
	fi_addr_t *result = NULL;
	struct psmx_epaddr_context *epaddr_context;

	av_priv = container_of(av, struct psmx_fid_av, av);

	errors = (psm_error_t *) calloc(count, sizeof *errors);
	if (!errors)
		return -FI_ENOMEM;

	mask = (int *) calloc(count, sizeof *mask);
	if (!mask) {
		free(errors);
		return -FI_ENOMEM;
	}

	if (av_priv->type == FI_AV_TABLE) {
		if (psmx_av_check_table_size(av_priv, count)) {
			free(mask);
			free(errors);
			return -FI_ENOMEM;
		}

		for (i=0; i<count; i++)
			av_priv->psm_epids[av_priv->last + i] = ((psm_epid_t *)addr)[i];

		result = fi_addr;
		addr = (const void *)(av_priv->psm_epids + av_priv->last);
		fi_addr = (fi_addr_t *)(av_priv->psm_epaddrs + av_priv->last);
	}

	/* prevent connecting to the same ep twice, which is fatal in PSM */
	for (i=0; i<count; i++) {
		psm_epconn_t epconn;
		if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
			epaddr_context = psm_epaddr_getctxt(epconn.addr);
			if (epaddr_context && epaddr_context->epid  == ((psm_epid_t *) addr)[i])
				((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
			else
				mask[i] = 1;
		}
		else {
			mask[i] = 1;
		}
	}

	psm_ep_connect(av_priv->domain->psm_ep, count, 
			(psm_epid_t *) addr, mask, errors,
			(psm_epaddr_t *) fi_addr, 30*1e9);

	for (i=0; i<count; i++){
		if (!mask[i])
			continue;

		if (errors[i] == PSM_OK || errors[i] == PSM_EPID_ALREADY_CONNECTED) {
			psmx_set_epaddr_context(av_priv->domain,
						((psm_epid_t *) addr)[i],
						((psm_epaddr_t *) fi_addr)[i]);
		}
		else {
			FI_INFO(&psmx_prov, FI_LOG_AV,
				"%d: psm_ep_connect returned %s. remote epid=%lx.\n",
				i, psm_error_get_string(errors[i]),
				((psm_epid_t *)addr)[i]);
			if (((psm_epid_t *)addr)[i] == 0)
				FI_INFO(&psmx_prov, FI_LOG_AV,
					"does the application depend on the provider"
					"to resolve IP address into endpoint id? if so"
					"check if the name server has started correctly"
					"at the other side.\n");
			fi_addr[i] = FI_ADDR_NOTAVAIL;
			error_count++;
		}
	}

	free(mask);
	free(errors);

	if (av_priv->type == FI_AV_TABLE) {
		/* NOTE: unresolved addresses are left in the AV table */
		if (result) {
			for (i=0; i<count; i++) {
				j = av_priv->last + i;
				if ((fi_addr_t)av_priv->psm_epaddrs[j] == FI_ADDR_NOTAVAIL)
					result[i] = FI_ADDR_NOTAVAIL;
				else
					result[i] = j;
			}
		}
		av_priv->last += count;
	}

	return count - error_count;
}
Пример #8
0
ssize_t _psmx_atomic_readwrite(struct fid_ep *ep,
				const void *buf,
				size_t count, void *desc,
				void *result, void *result_desc,
				fi_addr_t dest_addr,
				uint64_t addr, uint64_t key,
				enum fi_datatype datatype,
				enum fi_op op, void *context,
				uint64_t flags)
{
	struct psmx_fid_ep *ep_priv;
	struct psmx_fid_av *av;
	struct psmx_epaddr_context *epaddr_context;
	struct psmx_am_request *req;
	psm_amarg_t args[8];
	int am_flags = PSM_AM_FLAG_ASYNC;
	int chunk_size, len;
	size_t idx;

	ep_priv = container_of(ep, struct psmx_fid_ep, ep);

	if (flags & FI_TRIGGER) {
		struct psmx_trigger *trigger;
		struct fi_triggered_context *ctxt = context;

		trigger = calloc(1, sizeof(*trigger));
		if (!trigger)
			return -FI_ENOMEM;

		trigger->op = PSMX_TRIGGERED_ATOMIC_READWRITE;
		trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
					     struct psmx_fid_cntr, cntr);
		trigger->threshold = ctxt->trigger.threshold.threshold;
		trigger->atomic_readwrite.ep = ep;
		trigger->atomic_readwrite.buf = buf;
		trigger->atomic_readwrite.count = count;
		trigger->atomic_readwrite.desc = desc;
		trigger->atomic_readwrite.result = result;
		trigger->atomic_readwrite.result_desc = result_desc;
		trigger->atomic_readwrite.dest_addr = dest_addr;
		trigger->atomic_readwrite.addr = addr;
		trigger->atomic_readwrite.key = key;
		trigger->atomic_readwrite.datatype = datatype;
		trigger->atomic_readwrite.atomic_op = op;
		trigger->atomic_readwrite.context = context;
		trigger->atomic_readwrite.flags = flags & ~FI_TRIGGER;

		psmx_cntr_add_trigger(trigger->cntr, trigger);
		return 0;
	}

	if (!buf && op != FI_ATOMIC_READ)
		return -FI_EINVAL;

	if (datatype < 0 || datatype >= FI_DATATYPE_LAST)
		return -FI_EINVAL;

	if (op < 0 || op >= FI_ATOMIC_OP_LAST)
		return -FI_EINVAL;

	av = ep_priv->av;
	if (av && av->type == FI_AV_TABLE) {
		idx = dest_addr;
		if (idx >= av->last)
			return -FI_EINVAL;

		dest_addr = (fi_addr_t) av->psm_epaddrs[idx];
	}
	else if (!dest_addr) {
		return -FI_EINVAL;
	}

	epaddr_context = psm_epaddr_getctxt((void *)dest_addr);
	if (epaddr_context->epid == ep_priv->domain->psm_epid)
		return psmx_atomic_self(PSMX_AM_REQ_ATOMIC_READWRITE,
					ep_priv, buf, count, desc,
					NULL, NULL, result, result_desc,
					addr, key, datatype, op,
					context, flags);

	chunk_size = MIN(PSMX_AM_CHUNK_SIZE, psmx_am_param.max_request_short);
	len = fi_datatype_size(datatype) * count;
	if (len > chunk_size)
		return -FI_EMSGSIZE;

	if ((flags & FI_INJECT) && op != FI_ATOMIC_READ) {
		req = malloc(sizeof(*req) + len);
		if (!req)
			return -FI_ENOMEM;
		memset((void *)req, 0, sizeof(*req));
		memcpy((void *)req+sizeof(*req), (void *)buf, len);
		buf = (void *)req + sizeof(*req);
	}
	else {
		req = calloc(1, sizeof(*req));
		if (!req)
			return -FI_ENOMEM;
	}

	req->no_event = (flags & PSMX_NO_COMPLETION) ||
			(ep_priv->send_selective_completion && !(flags & FI_COMPLETION));

	req->op = PSMX_AM_REQ_ATOMIC_READWRITE;
	req->atomic.buf = (void *)buf;
	req->atomic.len = len;
	req->atomic.addr = addr;
	req->atomic.key = key;
	req->atomic.context = context;
	req->atomic.result = result;
	req->ep = ep_priv;
	if (op == FI_ATOMIC_READ)
		req->cq_flags = FI_READ | FI_ATOMIC;
	else
		req->cq_flags = FI_WRITE | FI_ATOMIC;

	args[0].u32w0 = PSMX_AM_REQ_ATOMIC_READWRITE;
	args[0].u32w1 = count;
	args[1].u64 = (uint64_t)(uintptr_t)req;
	args[2].u64 = addr;
	args[3].u64 = key;
	args[4].u32w0 = datatype;
	args[4].u32w1 = op;
	psm_am_request_short((psm_epaddr_t) dest_addr,
				PSMX_AM_ATOMIC_HANDLER, args, 5,
				(void *)buf, (buf?len:0), am_flags, NULL, NULL);

	return 0;
}
Пример #9
0
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
			  fi_addr_t *fi_addr, uint64_t flags, void *context)
{
	struct psmx_fid_av *av_priv;
	psm_error_t *errors;
	int error_count = 0;
	int *mask;
	int i, j, ret;
	fi_addr_t *result = NULL;
	struct psmx_epaddr_context *epaddr_context;

	if (count && !addr) {
		FI_INFO(&psmx_prov, FI_LOG_AV,
			"the input address array is NULL.\n");
		return -FI_EINVAL;
	}

	av_priv = container_of(av, struct psmx_fid_av, av);

	if ((av_priv->flags & FI_EVENT) && !av_priv->eq)
		return -FI_ENOEQ;

	errors = (psm_error_t *) calloc(count, sizeof *errors);
	if (!errors)
		return -FI_ENOMEM;

	mask = (int *) calloc(count, sizeof *mask);
	if (!mask) {
		free(errors);
		return -FI_ENOMEM;
	}

	if (av_priv->type == FI_AV_TABLE) {
		if (psmx_av_check_table_size(av_priv, count)) {
			free(mask);
			free(errors);
			return -FI_ENOMEM;
		}

		for (i=0; i<count; i++)
			av_priv->psm_epids[av_priv->last + i] = ((psm_epid_t *)addr)[i];

		result = fi_addr;
		addr = (const void *)(av_priv->psm_epids + av_priv->last);
		fi_addr = (fi_addr_t *)(av_priv->psm_epaddrs + av_priv->last);
	}

	/* prevent connecting to the same ep twice, which is fatal in PSM */
	for (i=0; i<count; i++) {
		psm_epconn_t epconn;
		if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
			epaddr_context = psm_epaddr_getctxt(epconn.addr);
			if (epaddr_context && epaddr_context->epid  == ((psm_epid_t *) addr)[i])
				((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
			else
				mask[i] = 1;
		} else {
			mask[i] = 1;
		}
	}

	psm_ep_connect(av_priv->domain->psm_ep, count, 
			(psm_epid_t *) addr, mask, errors,
			(psm_epaddr_t *) fi_addr, 30*1e9);

	for (i=0; i<count; i++){
		if (!mask[i]) {
			errors[i] = PSM_OK;
			continue;
		}

		if (errors[i] == PSM_OK || errors[i] == PSM_EPID_ALREADY_CONNECTED) {
			psmx_set_epaddr_context(av_priv->domain,
						((psm_epid_t *) addr)[i],
						((psm_epaddr_t *) fi_addr)[i]);
			errors[i] = PSM_OK;
		} else {
			psm_epconn_t epconn;

			/* If duplicated addresses are passed to psm_ep_connect(), all but one will fail
			 * with error "Endpoint could not be reached". They should be treated as already
			 * connected.
			 */
			if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
				epaddr_context = psm_epaddr_getctxt(epconn.addr);
				if (epaddr_context && epaddr_context->epid  == ((psm_epid_t *) addr)[i]) {
					((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
					errors[i] = PSM_OK;
					continue;
				}
			}

			FI_INFO(&psmx_prov, FI_LOG_AV,
				"%d: psm_ep_connect returned %s. remote epid=%lx.\n",
				i, psm_error_get_string(errors[i]),
				((psm_epid_t *)addr)[i]);
			if (((psm_epid_t *)addr)[i] == 0)
				FI_INFO(&psmx_prov, FI_LOG_AV,
					"does the application depend on the provider"
					"to resolve IP address into endpoint id? if so"
					"check if the name server has started correctly"
					"at the other side.\n");
			fi_addr[i] = FI_ADDR_NOTAVAIL;
			error_count++;

			if (av_priv->flags & FI_EVENT)
				psmx_av_post_completion(av_priv, context, i, errors[i]);
		}
	}

	if (av_priv->type == FI_AV_TABLE) {
		/* NOTE: unresolved addresses are left in the AV table */
		if (result) {
			for (i=0; i<count; i++) {
				j = av_priv->last + i;
				if ((fi_addr_t)av_priv->psm_epaddrs[j] == FI_ADDR_NOTAVAIL)
					result[i] = FI_ADDR_NOTAVAIL;
				else
					result[i] = j;
			}
		}
		av_priv->last += count;
	}

	if (av_priv->flags & FI_EVENT) {
		psmx_av_post_completion(av_priv, context, count - error_count, 0);
		ret = 0;
	} else {
		if (flags & FI_SYNC_ERR) {
			int *fi_errors = context;
			for (i=0; i<count; i++)
				fi_errors[i] = psmx_errno(errors[i]);
		}
		ret = count - error_count;
	}

	free(mask);
	free(errors);
	return ret;
}
Пример #10
0
ssize_t _psmx_write(struct fid_ep *ep, const void *buf, size_t len,
		      void *desc, fi_addr_t dest_addr,
		      uint64_t addr, uint64_t key, void *context,
		      uint64_t flags, uint64_t data)
{
	struct psmx_fid_ep *ep_priv;
	struct psmx_fid_av *av;
	struct psmx_epaddr_context *epaddr_context;
	struct psmx_am_request *req;
	psm_amarg_t args[8];
	int nargs;
	int am_flags = PSM_AM_FLAG_ASYNC;
	int chunk_size;
	psm_mq_req_t psm_req;
	uint64_t psm_tag;
	size_t idx;
	void *psm_context;
	int no_event;

	ep_priv = container_of(ep, struct psmx_fid_ep, ep);

	if (flags & FI_TRIGGER) {
		struct psmx_trigger *trigger;
		struct fi_triggered_context *ctxt = context;

		trigger = calloc(1, sizeof(*trigger));
		if (!trigger)
			return -FI_ENOMEM;

		trigger->op = PSMX_TRIGGERED_WRITE;
		trigger->cntr = container_of(ctxt->trigger.threshold.cntr,
					     struct psmx_fid_cntr, cntr);
		trigger->threshold = ctxt->trigger.threshold.threshold;
		trigger->write.ep = ep;
		trigger->write.buf = buf;
		trigger->write.len = len;
		trigger->write.desc = desc;
		trigger->write.dest_addr = dest_addr;
		trigger->write.addr = addr;
		trigger->write.key = key;
		trigger->write.context = context;
		trigger->write.flags = flags & ~FI_TRIGGER;
		trigger->write.data = data;

		psmx_cntr_add_trigger(trigger->cntr, trigger);
		return 0;
	}

	if (!buf)
		return -FI_EINVAL;

	av = ep_priv->av;
	if (av && av->type == FI_AV_TABLE) {
		idx = dest_addr;
		if (idx >= av->last)
			return -FI_EINVAL;

		dest_addr = (fi_addr_t) av->psm_epaddrs[idx];
	}
	else if (!dest_addr) {
		return -FI_EINVAL;
	}

	epaddr_context = psm_epaddr_getctxt((void *)dest_addr);
	if (epaddr_context->epid == ep_priv->domain->psm_epid)
		return psmx_rma_self(PSMX_AM_REQ_WRITE,
				     ep_priv, (void *)buf, len, desc,
				     addr, key, context, flags, data);

	no_event = (flags & PSMX_NO_COMPLETION) ||
		   (ep_priv->send_selective_completion && !(flags & FI_COMPLETION));

	if (flags & FI_INJECT) {
		if (len > PSMX_INJECT_SIZE)
			return -FI_EMSGSIZE;

		req = malloc(sizeof(*req) + len);
		if (!req)
			return -FI_ENOMEM;

		memset((void *)req, 0, sizeof(*req));
		memcpy((void *)req + sizeof(*req), (void *)buf, len);
		buf = (void *)req + sizeof(*req);
	}
	else {
		req = calloc(1, sizeof(*req));
		if (!req)
			return -FI_ENOMEM;

		PSMX_CTXT_TYPE(&req->fi_context) = no_event ?
						   PSMX_NOCOMP_WRITE_CONTEXT :
						   PSMX_WRITE_CONTEXT;
	}

	req->no_event = no_event;
	req->op = PSMX_AM_REQ_WRITE;
	req->write.buf = (void *)buf;
	req->write.len = len;
	req->write.addr = addr;	/* needed? */
	req->write.key = key; 	/* needed? */
	req->write.context = context;
	req->ep = ep_priv;
	req->cq_flags = FI_WRITE | FI_RMA;
	PSMX_CTXT_USER(&req->fi_context) = context;
	PSMX_CTXT_EP(&req->fi_context) = ep_priv;

	chunk_size = MIN(PSMX_AM_CHUNK_SIZE, psmx_am_param.max_request_short);

	if (psmx_env.tagged_rma && len > chunk_size) {
		void *payload = NULL;
		int payload_len = 0;

		psm_tag = PSMX_RMA_BIT | ep_priv->domain->psm_epid;
		args[0].u32w0 = PSMX_AM_REQ_WRITE_LONG;
		args[0].u32w1 = len;
		args[1].u64 = (uint64_t)req;
		args[2].u64 = addr;
		args[3].u64 = key;
		args[4].u64 = psm_tag;
		nargs = 5;
		if (flags & FI_REMOTE_CQ_DATA) {
			args[0].u32w0 |= PSMX_AM_DATA;
			payload = &data;
			payload_len = sizeof(data);
			am_flags = 0;
		}

		if (flags & FI_DELIVERY_COMPLETE) {
			args[0].u32w0 |= PSMX_AM_FORCE_ACK;
			psm_context = NULL;
		}
		else {
			psm_context = (void *)&req->fi_context;
		}

		/* NOTE: if nargs is greater than 5, the following psm_mq_isend
		 * would hang if the destination is on the same node (i.e. going
		 * through the shared memory path). As the result, the immediate
		 * data is sent as payload instead of args[5].
		 */
		psm_am_request_short((psm_epaddr_t) dest_addr,
					PSMX_AM_RMA_HANDLER, args, nargs,
					payload, payload_len, am_flags,
					NULL, NULL);

		psm_mq_isend(ep_priv->domain->psm_mq, (psm_epaddr_t) dest_addr,
				0, psm_tag, buf, len, psm_context, &psm_req);

		return 0;
	}

	nargs = 4;
	while (len > chunk_size) {
		args[0].u32w0 = PSMX_AM_REQ_WRITE;
		args[0].u32w1 = chunk_size;
		args[1].u64 = (uint64_t)(uintptr_t)req;
		args[2].u64 = addr;
		args[3].u64 = key;
		psm_am_request_short((psm_epaddr_t) dest_addr,
					PSMX_AM_RMA_HANDLER, args, nargs,
					(void *)buf, chunk_size,
					am_flags, NULL, NULL);
		buf += chunk_size;
		addr += chunk_size;
		len -= chunk_size;
	}

	args[0].u32w0 = PSMX_AM_REQ_WRITE | PSMX_AM_EOM;
	args[0].u32w1 = len;
	args[1].u64 = (uint64_t)(uintptr_t)req;
	args[2].u64 = addr;
	args[3].u64 = key;
	if (flags & FI_REMOTE_CQ_DATA) {
		args[4].u64 = data;
		args[0].u32w0 |= PSMX_AM_DATA;
		nargs++;
	}
	psm_am_request_short((psm_epaddr_t) dest_addr,
				PSMX_AM_RMA_HANDLER, args, nargs,
				(void *)buf, len, am_flags, NULL, NULL);

	return 0;
}
Пример #11
0
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
			  fi_addr_t *fi_addr, uint64_t flags)
{
	struct psmx_fid_av *fid_av;
	psm_error_t *errors;
	int *mask;
	int err;
	int i;
	fi_addr_t *result = NULL;
	struct psmx_epaddr_context *context;

	fid_av = container_of(av, struct psmx_fid_av, av);

	/* TODO: support the FI_RANGE flag */
	if (flags)
		return -FI_EBADFLAGS;

	errors = (psm_error_t *) calloc(count, sizeof *errors);
	if (!errors)
		return -ENOMEM;

	mask = (int *) calloc(count, sizeof *mask);
	if (!mask) {
		free(errors);
		return -ENOMEM;
	}

	if (fid_av->type == FI_AV_TABLE) {
		if (psmx_av_check_table_size(fid_av, count)) {
			free(mask);
			free(errors);
			return -ENOMEM;
		}

		for (i=0; i<count; i++)
			fid_av->psm_epids[fid_av->last + i] = ((psm_epid_t *)addr)[i];

		result = fi_addr;
		addr = (const void *)(fid_av->psm_epids + fid_av->last);
		fi_addr = (fi_addr_t *)(fid_av->psm_epaddrs + fid_av->last);
	}

	/* prevent connecting to the same ep twice, which is fatal in PSM */
	for (i=0; i<count; i++) {
		psm_epconn_t epconn;
		if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
			context = psm_epaddr_getctxt(epconn.addr);
			if (context && context->epid  == ((psm_epid_t *) addr)[i])
				((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
			else
				mask[i] = 1;
		}
		else {
			mask[i] = 1;
		}
	}

	err = psm_ep_connect(fid_av->domain->psm_ep, count, 
			(psm_epid_t *) addr, mask, errors,
			(psm_epaddr_t *) fi_addr, 30*1e9);

	for (i=0; i<count; i++){
		if (mask[i] && errors[i] == PSM_OK) {
			psmx_set_epaddr_context(fid_av->domain,
						((psm_epid_t *) addr)[i],
						((psm_epaddr_t *) fi_addr)[i]);
		}
	}

	free(mask);
	free(errors);

	if (fid_av->type == FI_AV_TABLE) {
		if (result) {
			for (i=0; i<count; i++)
				result[i] = fid_av->last + i;
		}
		fid_av->last += count;
	}

	return psmx_errno(err);
}
Пример #12
0
int psmx_am_msg_handler(psm_am_token_t token, psm_epaddr_t epaddr,
			psm_amarg_t *args, int nargs, void *src, uint32_t len)
#endif
{
        psm_amarg_t rep_args[8];
        struct psmx_am_request *req;
        struct psmx_cq_event *event;
	struct psmx_epaddr_context *epaddr_context;
	struct psmx_fid_domain *domain;
	int copy_len;
	uint64_t offset;
	int cmd, eom;
	int err = 0;
	int op_error = 0;
	struct psmx_unexp *unexp;
#if (PSM_VERNO_MAJOR >= 2)
	psm_epaddr_t epaddr;

	psm_am_get_source(token, &epaddr);
#endif

	epaddr_context = psm_epaddr_getctxt(epaddr);
	if (!epaddr_context) {
		FI_WARN(&psmx_prov, FI_LOG_EP_DATA,
			"NULL context for epaddr %p\n", epaddr);
		return -FI_EIO;
	}

	domain = epaddr_context->domain;

	cmd = args[0].u32w0 & PSMX_AM_OP_MASK;
	eom = args[0].u32w0 & PSMX_AM_EOM;

	switch (cmd) {
	case PSMX_AM_REQ_SEND:
		assert(len == args[0].u32w1);
                offset = args[3].u64;
		if (offset == 0) {
			/* this is the first packet */
			req = psmx_am_search_and_dequeue_recv(domain, (const void *)epaddr);
			if (req) {
				copy_len = MIN(len, req->recv.len);
				memcpy(req->recv.buf, src, len);
				req->recv.len_received += copy_len;
			}
			else {
				unexp = malloc(sizeof(*unexp) + len);
				if (!unexp) {
					op_error = -FI_ENOSPC;
				}
				else  {
					memcpy(unexp->buf, src, len);
					unexp->sender_addr = epaddr;
					unexp->sender_context = args[1].u64;
					unexp->len_received = len;
					unexp->done = !!eom;
					unexp->list_entry.next = NULL;
					psmx_am_enqueue_unexp(domain, unexp);

					if (!eom) {
						/* stop here. will reply when recv is posted */
						break;
					}
				}
			}

			if (!op_error && !eom) {
				/* reply w/ recv req to be used for following packets */
				rep_args[0].u32w0 = PSMX_AM_REP_SEND;
				rep_args[0].u32w1 = 0;
				rep_args[1].u64 = args[1].u64;
				rep_args[2].u64 = (uint64_t)(uintptr_t)req;
				err = psm_am_reply_short(token, PSMX_AM_MSG_HANDLER,
						rep_args, 3, NULL, 0, 0,
						NULL, NULL );
			}
		}
		else {
			req = (struct psmx_am_request *)(uintptr_t)args[2].u64;
			if (req) {
				copy_len = MIN(req->recv.len + offset, len);
				memcpy(req->recv.buf + offset, src, copy_len);
				req->recv.len_received += copy_len;
			}
			else {
				FI_WARN(&psmx_prov, FI_LOG_EP_DATA,
					"NULL recv_req in follow-up packets.\n");
				op_error = -FI_ENOMSG;
			}
		}

		if (eom && req) {
			if (req->ep->recv_cq && !req->no_event) {
				event = psmx_cq_create_event(
						req->ep->recv_cq,
						req->recv.context,
						req->recv.buf,
						req->cq_flags,
						req->recv.len_received,
						0, /* data */
						0, /* tag */
						req->recv.len - req->recv.len_received,
						0 /* err */);
				if (event)
					psmx_cq_enqueue_event(req->ep->recv_cq, event);
				else
					err = -FI_ENOMEM;
			}

			if (req->ep->recv_cntr)
				psmx_cntr_inc(req->ep->recv_cntr);

			free(req);
		}

		if (eom || op_error) {
			rep_args[0].u32w0 = PSMX_AM_REP_SEND;
			rep_args[0].u32w1 = op_error;
			rep_args[1].u64 = args[1].u64;
			rep_args[2].u64 = 0; /* done */
			err = psm_am_reply_short(token, PSMX_AM_MSG_HANDLER,
					rep_args, 3, NULL, 0, 0,
					NULL, NULL );
		}
                break;

	case PSMX_AM_REP_SEND:
		req = (struct psmx_am_request *)(uintptr_t)args[1].u64;
		op_error = (int)args[0].u32w1;
		assert(req->op == PSMX_AM_REQ_SEND);

		if (args[2].u64) { /* more to send */
			req->send.peer_context = (void *)(uintptr_t)args[2].u64;

			/* psm_am_request_short() can't be called inside the handler.
			 * put the request into a queue and process it later.
			 */
			psmx_am_enqueue_send(req->ep->domain, req);
		}
		else { /* done */
			if (req->ep->send_cq && !req->no_event) {
				event = psmx_cq_create_event(
						req->ep->send_cq,
						req->send.context,
						req->send.buf,
						req->cq_flags,
						req->send.len,
						0, /* data */
						0, /* tag */
						0, /* olen */
						op_error);
				if (event)
					psmx_cq_enqueue_event(req->ep->send_cq, event);
				else
					err = -FI_ENOMEM;
			}

			if (req->ep->send_cntr)
				psmx_cntr_inc(req->ep->send_cntr);

			free(req);
		}
		break;

	default:
		err = -FI_EINVAL;
	}

	return err;
}
Пример #13
0
int psmx_am_msg_handler(psm_am_token_t token, psm_epaddr_t epaddr,
			psm_amarg_t *args, int nargs, void *src, uint32_t len)
{
        psm_amarg_t rep_args[8];
        struct psmx_am_request *req;
        struct psmx_cq_event *event;
	struct psmx_epaddr_context *epaddr_context;
	struct psmx_fid_domain *domain;
	int msg_len;
	int copy_len;
	uint64_t offset;
	int cmd, eom;
	int err = 0;
	int op_error = 0;
	struct psmx_unexp *unexp;

	epaddr_context = psm_epaddr_getctxt(epaddr);
	if (!epaddr_context) {
		fprintf(stderr, "%s: NULL context for epaddr %p\n", __func__, epaddr);
		return -EIO;
	}

	domain = epaddr_context->domain;

	cmd = args[0].u32w0 & PSMX_AM_OP_MASK;
	eom = args[0].u32w0 & PSMX_AM_EOM;

	switch (cmd) {
	case PSMX_AM_REQ_SEND:
		msg_len = args[0].u32w1;
                offset = args[3].u64;
                assert(len == msg_len);
		if (offset == 0) {
			/* this is the first packet */
			req = psmx_am_search_and_dequeue_recv(domain, (const void *)epaddr);
			if (req) {
				copy_len = MIN(len, req->recv.len);
				memcpy(req->recv.buf, src, len);
				req->recv.len_received += copy_len;
			}
			else {
				unexp = malloc(sizeof(*unexp) + len);
				if (!unexp) {
					op_error = -ENOBUFS;
				}
				else  {
					memcpy(unexp->buf, src, len);
					unexp->sender_addr = epaddr;
					unexp->sender_context = args[1].u64;
					unexp->len_received = len;
					unexp->done = !!eom;
					unexp->next = NULL;
					psmx_unexp_enqueue(unexp);

					if (!eom) {
						/* stop here. will reply when recv is posted */
						break;
					}
				}
			}

			if (!op_error && !eom) {
				/* reply w/ recv req to be used for following packets */
				rep_args[0].u32w0 = PSMX_AM_REP_SEND;
				rep_args[0].u32w1 = 0;
				rep_args[1].u64 = args[1].u64;
				rep_args[2].u64 = (uint64_t)(uintptr_t)req;
				err = psm_am_reply_short(token, PSMX_AM_MSG_HANDLER,
						rep_args, 3, NULL, 0, 0,
						NULL, NULL );
			}
		}
		else {
			req = (struct psmx_am_request *)(uintptr_t)args[2].u64;
			if (req) {
				copy_len = MIN(req->recv.len + offset, len);
				memcpy(req->recv.buf + offset, src, copy_len);
				req->recv.len_received += copy_len;
			}
			else {
				fprintf(stderr, "%s: NULL recv_req in follow-up packets.\n", __func__);
				op_error = -EBADMSG;
			}
		}

		if (eom && req) {
			if (req->ep->recv_cq && !req->no_event) {
				event = psmx_cq_create_event(
						req->ep->recv_cq,
						req->recv.context,
						req->recv.buf,
						0, /* flags */
						req->recv.len_received,
						0, /* data */
						0, /* tag */
						req->recv.len - req->recv.len_received,
						0 /* err */);
				if (event)
					psmx_cq_enqueue_event(req->ep->recv_cq, event);
				else
					err = -ENOMEM;
			}

			if (req->ep->recv_cntr)
				psmx_cntr_inc(req->ep->recv_cntr);

			free(req);
		}

		if (eom || op_error) {
			rep_args[0].u32w0 = PSMX_AM_REP_SEND;
			rep_args[0].u32w1 = op_error;
			rep_args[1].u64 = args[1].u64;
			rep_args[2].u64 = 0; /* done */
			err = psm_am_reply_short(token, PSMX_AM_MSG_HANDLER,
					rep_args, 3, NULL, 0, 0,
					NULL, NULL );
		}
                break;

	case PSMX_AM_REP_SEND:
		req = (struct psmx_am_request *)(uintptr_t)args[1].u64;
		op_error = (int)args[0].u32w1;
		assert(req->op == PSMX_AM_REQ_SEND);

		if (args[2].u64) { /* more to send */
			req->send.peer_context = (void *)(uintptr_t)args[2].u64;

#if PSMX_AM_USE_SEND_QUEUE
			/* psm_am_request_short() can't be called inside the handler.
			 * put the request into a queue and process it later.
			 */
			psmx_am_enqueue_send(req->ep->domain, req);
			if (req->ep->domain->progress_thread)
				pthread_cond_signal(&req->ep->domain->progress_cond);
#else
			req->send.peer_ready = 1;
#endif
		}
		else { /* done */
			if (req->ep->send_cq && !req->no_event) {
				event = psmx_cq_create_event(
						req->ep->send_cq,
						req->send.context,
						req->send.buf,
						0, /* flags */
						req->send.len,
						0, /* data */
						0, /* tag */
						0, /* olen */
						op_error);
				if (event)
					psmx_cq_enqueue_event(req->ep->send_cq, event);
				else
					err = -ENOMEM;
			}

			if (req->ep->send_cntr)
				psmx_cntr_inc(req->ep->send_cntr);

			if (req->state == PSMX_AM_STATE_QUEUED)
				req->state = PSMX_AM_STATE_DONE;
			else
				free(req);
		}
		break;

	default:
		err = -EINVAL;
	}

	return err;
}