Exemplo n.º 1
0
/*
 * Try our best to read one copy of the requested object, local copy first.
 *
 * If the object cache is enabled and applies to this request, the request is
 * handed to the object cache and its result is returned directly.
 *
 * Otherwise returns SD_RES_SUCCESS as soon as any read succeeds, or the
 * result of the last attempted read when all of them fail. If no read is
 * attempted at all, the initial SD_RES_SUCCESS is returned unchanged.
 *
 * We don't call gateway_forward_request() because we only read once.
 */
int gateway_read_obj(struct request *req)
{
	int i, ret = SD_RES_SUCCESS;
	unsigned wlen, rlen;
	struct sd_req fwd_hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr;
	struct sd_vnode *v;
	struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
	uint64_t oid = req->rq.obj.oid;
	int nr_copies, j;

	if (is_object_cache_enabled() && !req->local && !bypass_object_cache(req))
		return object_cache_handle_request(req);

	nr_copies = get_req_copy_number(req);
	oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
		      nr_copies, obj_vnodes);

	/* At most one local attempt: stop at the first local vnode. */
	for (i = 0; i < nr_copies; i++) {
		v = obj_vnodes[i];
		if (!vnode_is_local(v))
			continue;
		ret = peer_read_obj(req);
		if (ret == SD_RES_SUCCESS)
			return ret;

		eprintf("local read fail %x\n", ret);
		break;
	}

	/*
	 * Read random copy from cluster for better load balance, useful for
	 * reading base VM's COW objects.
	 *
	 * random() may return values up to 2^31 - 1, so the start offset is
	 * reduced modulo nr_copies here; otherwise "i + j" below could
	 * overflow a signed int and yield a negative index.
	 */
	j = nr_copies > 0 ? (int)(random() % nr_copies) : 0;
	for (i = 0; i < nr_copies; i++) {
		int idx = (i + j) % nr_copies;

		v = obj_vnodes[idx];
		if (vnode_is_local(v))
			continue;
		/*
		 * We need to re-init it because rsp and req share the same
		 * structure.
		 */
		gateway_init_fwd_hdr(&fwd_hdr, &req->rq);
		wlen = 0;
		rlen = fwd_hdr.data_length;
		ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data, &wlen,
				     &rlen);
		if (ret != SD_RES_SUCCESS)
			continue;

		/* Read success: hand the remote response back to the caller */
		memcpy(&req->rp, rsp, sizeof(*rsp));
		break;
	}
	return ret;
}
Exemplo n.º 2
0
static int forward_read_obj_req(struct request *req)
{
	int i, fd, ret = SD_RES_SUCCESS;
	unsigned wlen, rlen;
	struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq;
	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr;
	struct sd_vnode *v;
	uint64_t oid = hdr.oid;
	int nr_copies;

	hdr.flags |= SD_FLAG_CMD_IO_LOCAL;

	if (hdr.copies)
		nr_copies = hdr.copies;
	else
		nr_copies = get_nr_copies(req->vnodes);

	/* TODO: we can do better; we need to check this first */
	for (i = 0; i < nr_copies; i++) {
		v = oid_to_vnode(req->vnodes, oid, i);
		if (vnode_is_local(v)) {
			ret = do_local_io(req, hdr.epoch);
			if (ret != SD_RES_SUCCESS)
				goto read_remote;
			return ret;
		}
	}

read_remote:
	for (i = 0; i < nr_copies; i++) {
		v = oid_to_vnode(req->vnodes, oid, i);
		if (vnode_is_local(v))
			continue;

		fd = get_sheep_fd(v->addr, v->port, v->node_idx, hdr.epoch);
		if (fd < 0) {
			ret = SD_RES_NETWORK_ERROR;
			continue;
		}

		wlen = 0;
		rlen = hdr.data_length;

		ret = exec_req(fd, (struct sd_req *)&hdr, req->data, &wlen, &rlen);

		if (ret) { /* network errors */
			del_sheep_fd(fd);
			ret = SD_RES_NETWORK_ERROR;
			continue;
		} else {
			memcpy(&req->rp, rsp, sizeof(*rsp));
			ret = rsp->result;
			break;
		}
	}
	return ret;
}
Exemplo n.º 3
0
/*
 * Tell whether the object no longer belongs to this node.
 *
 * Returns false when the object's copy number is reported as zero or when
 * at least one of the vnodes the object maps to is local; true otherwise.
 * The vnode info reference taken here is released before returning.
 */
static bool oid_stale(uint64_t oid)
{
	struct sd_vnode *replicas[SD_MAX_COPIES];
	struct vnode_info *vinfo = get_vnode_info();
	bool stale = true;
	int copies, n;

	copies = get_obj_copy_number(oid, vinfo->nr_zones);
	if (copies) {
		oid_to_vnodes(vinfo->vnodes, vinfo->nr_vnodes, oid,
			      copies, replicas);
		/* Stop scanning at the first local replica. */
		for (n = 0; n < copies && stale; n++) {
			if (vnode_is_local(replicas[n]))
				stale = false;
		}
	} else {
		stale = false;
	}

	put_vnode_info(vinfo);
	return stale;
}
Exemplo n.º 4
0
/*
 * Screen out objects that don't belong to this node.
 *
 * Every oid in oids[0..nr_oids) that maps to at least one local vnode and is
 * not already present among the first old_count entries of rw->oids is
 * appended to rw->oids; the list is re-sorted with obj_cmp afterwards.
 */
static void screen_object_list(struct recovery_work *rw,
			       uint64_t *oids, int nr_oids)
{
	struct sd_vnode *replicas[SD_MAX_COPIES];
	int nr_known = rw->count;	/* entries that were sorted on entry */
	int copies = get_nr_copies(rw->cur_vnodes);
	int n, c;

	for (n = 0; n < nr_oids; n++) {
		oid_to_vnodes(rw->cur_vnodes, oids[n], copies, replicas);
		for (c = 0; c < copies; c++) {
			if (vnode_is_local(replicas[c]) &&
			    !bsearch(&oids[n], rw->oids, nr_known,
				     sizeof(uint64_t), obj_cmp)) {
				rw->oids[rw->count++] = oids[n];
				break;
			}
		}
	}

	qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
}
Exemplo n.º 5
0
/*
 * Recover one object from the replica held by the given vnode.
 *
 * A local replica is recovered through sd_store->link(). A remote replica is
 * fetched with an SD_OP_READ_PEER request and stored through
 * sd_store->create_and_write(). On success the oid is also inserted into the
 * object list cache.
 *
 * Returns SD_RES_SUCCESS on success, SD_RES_NO_MEM when the transfer buffer
 * cannot be allocated, or the failing result of the store / remote request.
 */
static int recover_object_from_replica(uint64_t oid,
				       const struct sd_vnode *vnode,
				       uint32_t epoch, uint32_t tgt_epoch)
{
	struct siocb iocb = { 0 };
	struct sd_req hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	void *buf = NULL;
	unsigned rlen;
	int ret;

	if (vnode_is_local(vnode)) {
		ret = sd_store->link(oid, tgt_epoch);
		goto out;
	}

	rlen = get_objsize(oid);
	buf = valloc(rlen);
	if (!buf) {
		sd_eprintf("%m");
		ret = SD_RES_NO_MEM;
		goto out;
	}

	sd_init_req(&hdr, SD_OP_READ_PEER);
	hdr.epoch = epoch;
	hdr.flags = SD_FLAG_CMD_RECOVERY;
	hdr.data_length = rlen;
	hdr.obj.oid = oid;
	hdr.obj.tgt_epoch = tgt_epoch;

	ret = sheep_exec_req(&vnode->nid, &hdr, buf);
	if (ret == SD_RES_SUCCESS) {
		/* The response overlays hdr; take length/offset from it. */
		iocb.buf = buf;
		iocb.offset = rsp->obj.offset;
		iocb.length = rsp->data_length;
		iocb.epoch = epoch;
		ret = sd_store->create_and_write(oid, &iocb);
	}
out:
	if (ret == SD_RES_SUCCESS) {
		sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid,
			tgt_epoch, epoch);
		objlist_cache_insert(oid);
	}
	free(buf);
	return ret;
}
Exemplo n.º 6
0
/*
 * Screen out objects that don't belong to this node.
 *
 * For every oid the copy number is looked up, retrying once per second for
 * as long as the lookup returns zero. An oid that maps to at least one local
 * vnode and is not already among the first old_count entries of rw->oids is
 * appended; the list is re-sorted with obj_cmp afterwards.
 */
static void screen_object_list(struct recovery_work *rw,
			       uint64_t *oids, int nr_oids)
{
	struct sd_vnode *replicas[SD_MAX_COPIES];
	int nr_known = rw->count;	/* entries that were sorted on entry */
	int copies;
	int n, c;

	for (n = 0; n < nr_oids; n++) {
		/* FIXME: can we wait for post_cluster_new_vdi
		 *        with a better way? */
		while (!(copies = get_obj_copy_number(oids[n],
					rw->cur_vinfo->nr_zones))) {
			dprintf("can not find copy number for object %" PRIx64
				"\n", oids[n]);
			dprintf("probably, vdi was created but "
				"post_cluster_new_vdi() is not called yet\n");
			sleep(1);
		}

		oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes,
			      oids[n], copies, replicas);
		for (c = 0; c < copies; c++) {
			if (vnode_is_local(replicas[c]) &&
			    !bsearch(&oids[n], rw->oids, nr_known,
				     sizeof(uint64_t), obj_cmp)) {
				rw->oids[rw->count++] = oids[n];
				break;
			}
		}
	}

	qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
}
Exemplo n.º 7
0
/*
 * Decide whether an object held by this node is stale under vinfo.
 *
 * Only the first local vnode among the object's vnodes is considered:
 *  - when ec_index >= SD_MAX_COPIES (the replicated case, going by the
 *    original description), a local vnode means the object is not stale;
 *  - when ec_index < SD_MAX_COPIES (erasure-coded case), every copy is
 *    unique, so the object is only kept if that local vnode sits exactly at
 *    position ec_index; a strip whose index moved is considered stale.
 *
 * Returns true when no vnode of the object is local.
 */
static bool oid_stale(uint64_t oid, int ec_index, struct vnode_info *vinfo)
{
	const struct sd_vnode *replicas[SD_MAX_COPIES];
	uint32_t pos, copies;

	copies = get_obj_copy_number(oid, vinfo->nr_zones);
	oid_to_vnodes(oid, &vinfo->vroot, copies, replicas);

	for (pos = 0; pos < copies; pos++) {
		if (!vnode_is_local(replicas[pos]))
			continue;

		/* The first local vnode settles the answer. */
		if (ec_index >= SD_MAX_COPIES)
			return false;
		return pos != ec_index;
	}

	return true;
}
Exemplo n.º 8
0
/*
 * Screen out objects that don't belong to this node.
 *
 * Objects whose copy number cannot be determined are logged and skipped.
 * An oid that maps to at least one local vnode and is not already among the
 * first old_count entries of rw->oids is appended; the backing buffer is
 * doubled (list_buffer_size) whenever it becomes full. The list is re-sorted
 * with obj_cmp afterwards.
 */
static void screen_object_list(struct recovery_work *rw,
			       uint64_t *oids, size_t nr_oids)
{
	const struct sd_vnode *vnodes[SD_MAX_COPIES];
	int old_count = rw->count;
	int nr_objs;
	size_t i;	/* same type as nr_oids: no signed/unsigned compare */
	int j;

	for (i = 0; i < nr_oids; i++) {
		nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones);
		if (!nr_objs) {
			sd_eprintf("ERROR: can not find copy number for object"
				   " %" PRIx64, oids[i]);
			continue;
		}
		oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes,
			      oids[i], nr_objs, vnodes);
		for (j = 0; j < nr_objs; j++) {
			if (!vnode_is_local(vnodes[j]))
				continue;
			/* Only the entries present on entry are sorted. */
			if (bsearch(&oids[i], rw->oids, old_count,
				    sizeof(uint64_t), obj_cmp))
				continue;

			rw->oids[rw->count++] = oids[i];
			/* enlarge the list buffer if full */
			if (rw->count == list_buffer_size / sizeof(uint64_t)) {
				list_buffer_size *= 2;
				rw->oids = xrealloc(rw->oids, list_buffer_size);
			}
			break;
		}
	}

	qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
}
Exemplo n.º 9
0
/*
 * Forward a write request to every replica of the object and collect the
 * replies.
 *
 * The request is first sent to each non-local replica, then the local copy
 * (if this node holds one) is written through do_local_io(), and finally the
 * remote responses are gathered with poll(), one response per wakeup.
 *
 * Returns SD_RES_SUCCESS when nothing failed, SD_RES_NETWORK_ERROR on
 * connect/send/receive failures or on timeout, SD_RES_EIO when poll() itself
 * fails, or the failing result reported by the local store or a remote node.
 */
int forward_write_obj_req(struct request *req)
{
	int i, fd, ret, pollret;
	unsigned wlen;
	char name[128];
	struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq;
	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
	struct sd_vnode *v;
	uint64_t oid = hdr.oid;
	int nr_copies;
	struct pollfd pfds[SD_MAX_REDUNDANCY];
	int nr_fds, local = 0;

	dprintf("%"PRIx64"\n", oid);

	nr_fds = 0;
	memset(pfds, 0, sizeof(pfds));
	for (i = 0; i < ARRAY_SIZE(pfds); i++)
		pfds[i].fd = -1;

	hdr.flags |= SD_FLAG_CMD_IO_LOCAL;

	wlen = hdr.data_length;

	/* Phase 1: send the request to every remote replica. */
	nr_copies = get_nr_copies(req->vnodes);
	for (i = 0; i < nr_copies; i++) {
		v = oid_to_vnode(req->vnodes, oid, i);

		addr_to_str(name, sizeof(name), v->addr, 0);

		if (vnode_is_local(v)) {
			local = 1;
			continue;
		}

		fd = get_sheep_fd(v->addr, v->port, v->node_idx, hdr.epoch);
		if (fd < 0) {
			eprintf("failed to connect to %s:%"PRIu32"\n", name, v->port);
			ret = SD_RES_NETWORK_ERROR;
			goto out;
		}

		ret = send_req(fd, (struct sd_req *)&hdr, req->data, &wlen);
		if (ret) { /* network errors */
			del_sheep_fd(fd);
			ret = SD_RES_NETWORK_ERROR;
			dprintf("fail %"PRIu32"\n", ret);
			goto out;
		}

		pfds[nr_fds].fd = fd;
		pfds[nr_fds].events = POLLIN;
		nr_fds++;
	}

	/* Phase 2: write the local copy while the remote nodes are busy. */
	if (local) {
		ret = do_local_io(req, hdr.epoch);
		rsp->result = ret;

		if (nr_fds == 0) {
			eprintf("exit %"PRIu32"\n", ret);
			goto out;
		}

		if (rsp->result != SD_RES_SUCCESS) {
			eprintf("fail %"PRIu32"\n", ret);
			goto out;
		}
	}

	/* Phase 3: collect the remote responses. */
	ret = SD_RES_SUCCESS;
again:
	pollret = poll(pfds, nr_fds, DEFAULT_SOCKET_TIMEOUT * 1000);
	if (pollret < 0) {
		if (errno == EINTR)
			goto again;

		/*
		 * poll() failed for good: revents is not meaningful, so do
		 * not scan it (and do not retry forever). Drop the pending
		 * connections, as on timeout, and give up.
		 */
		eprintf("poll failed: %m\n");

		for (i = 0; i < nr_fds; i++)
			del_sheep_fd(pfds[i].fd);

		ret = SD_RES_EIO;
		goto out;
	} else if (pollret == 0) { /* poll time out */
		eprintf("timeout\n");

		for (i = 0; i < nr_fds; i++)
			del_sheep_fd(pfds[i].fd);

		ret = SD_RES_NETWORK_ERROR;
		goto out;
	}

	/* Handle the first descriptor that has something to report. */
	for (i = 0; i < nr_fds; i++) {
		if (pfds[i].fd < 0)
			break;

		if (pfds[i].revents & POLLERR || pfds[i].revents & POLLHUP || pfds[i].revents & POLLNVAL) {
			del_sheep_fd(pfds[i].fd);
			ret = SD_RES_NETWORK_ERROR;
			break;
		}

		if (!(pfds[i].revents & POLLIN))
			continue;

		if (do_read(pfds[i].fd, rsp, sizeof(*rsp))) {
			eprintf("failed to read a response: %m\n");
			del_sheep_fd(pfds[i].fd);
			ret = SD_RES_NETWORK_ERROR;
			break;
		}

		if (rsp->result != SD_RES_SUCCESS) {
			eprintf("fail %"PRIu32"\n", rsp->result);
			ret = rsp->result;
		}

		break;
	}
	/* Remove the handled descriptor from the poll set. */
	if (i < nr_fds) {
		nr_fds--;
		memmove(pfds + i, pfds + i + 1, sizeof(*pfds) * (nr_fds - i));
	}

	dprintf("%"PRIx64" %"PRIu32"\n", oid, nr_fds);

	if (nr_fds > 0) {
		goto again;
	}
out:
	return ret;
}
Exemplo n.º 10
0
/*
 * Recover one object from the replica held by the given vnode.
 *
 * A local replica is recovered through sd_store->link(). A remote replica is
 * read over a fresh connection with SD_OP_READ_PEER and stored through
 * sd_store->atomic_put(). When the final result equals SD_RES_SUCCESS the
 * oid is also inserted into the object list cache.
 *
 * Returns 0 after a successful local link, the store's SD_RES_SUCCESS after
 * a successful remote recovery, the remote node's result code when it
 * reports a failure, and -1 on every other error.
 */
static int recover_object_from_replica(uint64_t oid,
				       struct sd_vnode *entry,
				       uint32_t epoch, uint32_t tgt_epoch)
{
	struct siocb iocb = { 0 };
	struct sd_req hdr;
	/* The response is received into the storage of the request header. */
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	unsigned wlen = 0, rlen = get_objsize(oid);
	char name[128];
	void *buf;
	int fd, ret = -1;

	buf = valloc(rlen);
	if (!buf) {
		eprintf("%m\n");
		goto out;
	}

	if (vnode_is_local(entry)) {
		iocb.epoch = epoch;
		iocb.length = rlen;
		if (sd_store->link(oid, &iocb, tgt_epoch) != SD_RES_SUCCESS) {
			ret = -1;
			goto out;
		}
		ret = 0;
		goto done;
	}

	addr_to_str(name, sizeof(name), entry->nid.addr, 0);
	fd = connect_to(name, entry->nid.port);
	dprintf("%s, %d\n", name, entry->nid.port);
	if (fd < 0) {
		eprintf("failed to connect to %s:%"PRIu32"\n", name, entry->nid.port);
		ret = -1;
		goto out;
	}

	sd_init_req(&hdr, SD_OP_READ_PEER);
	hdr.obj.oid = oid;
	hdr.obj.tgt_epoch = tgt_epoch;
	hdr.epoch = epoch;
	hdr.flags = SD_FLAG_CMD_RECOVERY;
	hdr.data_length = rlen;

	ret = exec_req(fd, &hdr, buf, &wlen, &rlen);
	close(fd);
	if (ret) {
		eprintf("res: %"PRIx32"\n", rsp->result);
		ret = -1;
		goto out;
	}

	if (rsp->result != SD_RES_SUCCESS) {
		eprintf("failed, res: %"PRIx32"\n", rsp->result);
		ret = rsp->result;
		goto out;
	}

	iocb.buf = buf;
	iocb.length = rlen;
	iocb.epoch = epoch;
	ret = sd_store->atomic_put(oid, &iocb);
	if (ret != SD_RES_SUCCESS) {
		ret = -1;
		goto out;
	}
done:
	dprintf("recovered oid %"PRIx64" from %d to epoch %d\n", oid, tgt_epoch, epoch);
out:
	if (ret == SD_RES_SUCCESS)
		objlist_cache_insert(oid);
	free(buf);
	return ret;
}
Exemplo n.º 11
0
/*
 * Try our best to read one copy of the requested object, local copy first.
 *
 * If the object cache is enabled and applies to this request, the request is
 * served by the object cache. Otherwise the local replica (if any) is read
 * first and, failing that, the remote replicas are tried starting from a
 * random position.
 *
 * Returns SD_RES_SUCCESS as soon as any read succeeds, SD_RES_HALT when the
 * request's copy number is zero, or the result of the last attempted read
 * when all of them fail. On success, for clients older than
 * SD_PROTO_VER_TRIM_ZERO_SECTORS, the trimmed zero blocks are restored in
 * req->data and the response is adjusted to cover the full request length.
 *
 * We don't call gateway_forward_request() because we only read once.
 */
int gateway_read_obj(struct request *req)
{
	int i, ret = SD_RES_SUCCESS;
	struct sd_req fwd_hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr;
	const struct sd_vnode *v;
	const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
	uint64_t oid = req->rq.obj.oid;
	int nr_copies, j;

	if (sys->enable_object_cache && !req->local &&
	    !bypass_object_cache(req)) {
		ret = object_cache_handle_request(req);
		goto out;
	}

	nr_copies = get_req_copy_number(req);

	if (nr_copies == 0) {
		sd_debug("there is no living nodes");
		return SD_RES_HALT;
	}

	oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
		      nr_copies, obj_vnodes);

	/*
	 * Look at every replica position (not just the first one) to find a
	 * local copy; at most one local read is attempted.
	 */
	for (i = 0; i < nr_copies; i++) {
		v = obj_vnodes[i];
		if (!vnode_is_local(v))
			continue;
		ret = peer_read_obj(req);
		if (ret == SD_RES_SUCCESS)
			goto out;

		sd_err("local read %"PRIx64" failed, %s", oid,
		       sd_strerror(ret));
		break;
	}

	/*
	 * Read random copy from cluster for better load balance, useful for
	 * reading base VM's COW objects.
	 *
	 * random() may return values up to 2^31 - 1, so the start offset is
	 * reduced modulo nr_copies (non-zero here) to keep "i + j" below from
	 * overflowing a signed int.
	 */
	j = random() % nr_copies;
	for (i = 0; i < nr_copies; i++) {
		int idx = (i + j) % nr_copies;

		v = obj_vnodes[idx];
		if (vnode_is_local(v))
			continue;
		/*
		 * We need to re-init it because rsp and req share the same
		 * structure.
		 */
		gateway_init_fwd_hdr(&fwd_hdr, &req->rq);
		ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data);
		if (ret != SD_RES_SUCCESS)
			continue;

		/* Read success: hand the remote response back to the caller */
		memcpy(&req->rp, rsp, sizeof(*rsp));
		break;
	}
out:
	if (ret == SD_RES_SUCCESS &&
	    req->rq.proto_ver < SD_PROTO_VER_TRIM_ZERO_SECTORS) {
		/* the client doesn't support trimming zero bytes */
		untrim_zero_blocks(req->data, req->rp.obj.offset,
				   req->rp.data_length, req->rq.data_length);
		req->rp.data_length = req->rq.data_length;
		req->rp.obj.offset = 0;
	}
	return ret;
}