Example #1
0
/*
 * Write iocb->length bytes from iocb->buf into the object file of 'oid',
 * starting at iocb->offset.
 *
 * Returns SD_RES_OLD_NODE_VER when the request epoch is older than the
 * current system epoch, the err_to_sderr() translation of errno when the
 * open or the write fails, and SD_RES_SUCCESS otherwise.
 */
int default_write(uint64_t oid, struct siocb *iocb)
{
	int flags = get_open_flags(oid, false), fd, ret = SD_RES_SUCCESS;
	char path[PATH_MAX];
	ssize_t size;

	if (iocb->epoch < sys_epoch()) {
		dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch());
		return SD_RES_OLD_NODE_VER;
	}

	get_obj_path(oid, path);
	/* Cache-flagged requests drop O_DSYNC when the disk cache is enabled */
	if ((iocb->flags & SD_FLAG_CMD_CACHE) && is_disk_cache_enabled())
		flags &= ~O_DSYNC;
	fd = open(path, flags, def_fmode);
	if (fd < 0)
		return err_to_sderr(oid, errno);

	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
	if (size != iocb->length) {
		/*
		 * Capture errno before logging: the logging call may
		 * overwrite it before it is translated below.
		 */
		int err = errno;

		eprintf("failed to write object %"PRIx64", path=%s, offset=%"
			PRId64", size=%"PRId32", result=%zd, %m\n", oid, path,
			iocb->offset, iocb->length, size);
		ret = err_to_sderr(oid, err);
	}

	close(fd);
	return ret;
}
Example #2
0
/*
 * Write the payload described by 'iocb' into the stored object 'oid'.
 *
 * The data is first handed to the journal when journaling is enabled; if
 * that fails, journaling is switched off, O_DSYNC is added to the open
 * flags and sync() is called. For sparse objects default_trim() may adjust
 * the offset/length that is actually written; after its first failure it is
 * not attempted again (function-local static flag).
 *
 * Returns SD_RES_OLD_NODE_VER when the request epoch is older than the
 * current system epoch, an err_to_sderr() translated error when the object
 * is missing or the open/write fails, and SD_RES_SUCCESS otherwise.
 */
int default_write(uint64_t oid, const struct siocb *iocb)
{
	int flags = prepare_iocb(oid, iocb, false), fd,
	    ret = SD_RES_SUCCESS;
	char path[PATH_MAX];
	ssize_t size;
	uint32_t len = iocb->length;
	uint64_t offset = iocb->offset;
	static bool trim_is_supported = true;

	if (iocb->epoch < sys_epoch()) {
		sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
		return SD_RES_OLD_NODE_VER;
	}

	/*
	 * The branch hint wraps the whole comparison, so the test does not
	 * depend on how unlikely() normalizes its argument.
	 */
	if (uatomic_is_true(&sys->use_journal) &&
	    unlikely(journal_write_store(oid, iocb->buf, iocb->length,
					 iocb->offset, false)
		     != SD_RES_SUCCESS)) {
		sd_err("turn off journaling");
		uatomic_set_false(&sys->use_journal);
		flags |= O_DSYNC;
		sync();
	}

	get_store_path(oid, iocb->ec_index, path);

	/*
	 * Make sure oid is in the right place because oid might be misplaced
	 * in a wrong place, due to 'shutdown/restart with less/more disks' or
	 * any bugs. We need to call err_to_sderr() to return EIO if the disk
	 * is broken.
	 */
	if (!default_exist(oid, iocb->ec_index))
		return err_to_sderr(path, oid, ENOENT);

	fd = open(path, flags, sd_def_fmode);
	if (unlikely(fd < 0))
		return err_to_sderr(path, oid, errno);

	if (trim_is_supported && is_sparse_object(oid)) {
		if (default_trim(fd, oid, iocb, &offset, &len) < 0) {
			/* Fall back to writing the full, untrimmed range */
			trim_is_supported = false;
			offset = iocb->offset;
			len = iocb->length;
		}
	}

	size = xpwrite(fd, iocb->buf, len, offset);
	if (unlikely(size != len)) {
		/*
		 * Capture errno before logging: the logging call may
		 * overwrite it before it is translated below.
		 */
		int err = errno;

		sd_err("failed to write object %"PRIx64", path=%s, offset=%"
		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
		       iocb->offset, iocb->length, size);
		ret = err_to_sderr(path, oid, err);
	}

	close(fd);
	return ret;
}
Example #3
0
File: farm.c Project: yamt/sheepdog
/*
 * Write iocb->length bytes from iocb->buf at iocb->offset into the object
 * file of 'oid' while holding an exclusive flock() on it.
 *
 * When 'create' is non-zero the file is opened with O_CREAT | O_TRUNC and,
 * unless SD_FLAG_CMD_COW is set in iocb->flags, preallocated to the object
 * size before the write. After a successful write the trunk entry of the
 * object is updated.
 *
 * Returns SD_RES_OLD_NODE_VER for requests older than the current epoch,
 * the err_to_sderr() translation of errno when open() fails, the prealloc()
 * result when preallocation fails, SD_RES_EIO on lock/unlock/write failure
 * and SD_RES_SUCCESS otherwise.
 */
static int farm_write(uint64_t oid, struct siocb *iocb, int create)
{
	int flags = def_open_flags, fd, ret = SD_RES_SUCCESS;
	char path[PATH_MAX];
	ssize_t size;

	if (iocb->epoch < sys_epoch()) {
		dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch());
		return SD_RES_OLD_NODE_VER;
	}
	if (!is_data_obj(oid))
		flags &= ~O_DIRECT;

	if (create)
		flags |= O_CREAT | O_TRUNC;

	/* Bounded formatting: never write past the end of path[] */
	snprintf(path, sizeof(path), "%s%016"PRIx64, obj_path, oid);
	fd = open(path, flags, def_fmode);
	if (fd < 0)
		return err_to_sderr(oid, errno);

	if (flock(fd, LOCK_EX) < 0) {
		ret = SD_RES_EIO;
		eprintf("%m\n");
		goto out;
	}
	if (create && !(iocb->flags & SD_FLAG_CMD_COW)) {
		ret = prealloc(fd, get_objsize(oid));
		if (ret != SD_RES_SUCCESS) {
			/* An unlock failure overrides the prealloc error */
			if (flock(fd, LOCK_UN) < 0) {
				ret = SD_RES_EIO;
				eprintf("%m\n");
			}
			goto out;
		}
	}
	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
	if (flock(fd, LOCK_UN) < 0) {
		ret = SD_RES_EIO;
		eprintf("%m\n");
		goto out;
	}
	if (size != iocb->length) {
		eprintf("%m\n");
		ret = SD_RES_EIO;
		goto out;
	}

	trunk_update_entry(oid);
out:
	close(fd);
	return ret;
}
Example #4
0
/*
 * Wait for all forward requests completion.
 *
 * Even if something goes wrong, we have to wait forward requests completion to
 * avoid interleaved requests.
 *
 * Each pass polls the descriptors prepared by pfd_info_init() with a
 * 5000 ms timeout, handles at most one readable descriptor, and repeats
 * while wi->nr_sent is still positive. The response header is read into
 * req->rp.
 *
 * Return error code if any one request fails. A later successful response
 * does not reset an error recorded earlier.
 */
static int wait_forward_request(struct write_info *wi, struct request *req)
{
	int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i;
	struct pfd_info pi;
	struct sd_rsp *rsp = &req->rp;
again:
	/* Rebuild the poll set on every pass from the current state of wi */
	pfd_info_init(wi, &pi);
	pollret = poll(pi.pfds, pi.nr, 5000);
	if (pollret < 0) {
		/* Interrupted by a signal: just poll again */
		if (errno == EINTR)
			goto again;

		panic("%m\n");
	} else if (pollret == 0) {
		eprintf("poll timeout %d\n", wi->nr_sent);

		/* Keep waiting as long as the request epoch is still current */
		if (req->rq.epoch == sys_epoch())
			goto again;

		nr_sent = wi->nr_sent;
		/* XXX Blindly close all the connections */
		for (i = 0; i < nr_sent; i++)
			finish_one_write_err(wi, i);

		err_ret = SD_RES_NETWORK_ERROR;
		goto finish_write;
	}

	/*
	 * Pick the first descriptor that reports POLLIN.
	 * NOTE(review): a descriptor reporting only POLLERR/POLLHUP/POLLNVAL
	 * without POLLIN is not selected by this scan - confirm that this is
	 * intended.
	 */
	nr_sent = wi->nr_sent;
	for (i = 0; i < nr_sent; i++)
		if (pi.pfds[i].revents & POLLIN)
			break;
	if (i < nr_sent) {
		int re = pi.pfds[i].revents;
		dprintf("%d, revents %x\n", i, re);
		if (re & (POLLERR | POLLHUP | POLLNVAL)) {
			err_ret = SD_RES_NETWORK_ERROR;
			finish_one_write_err(wi, i);
			goto finish_write;
		}
		/* A non-zero do_read() result is treated as a lost peer */
		if (do_read(pi.pfds[i].fd, rsp, sizeof(*rsp))) {
			eprintf("remote node might have gone away\n");
			err_ret = SD_RES_NETWORK_ERROR;
			finish_one_write_err(wi, i);
			goto finish_write;
		}

		ret = rsp->result;
		if (ret != SD_RES_SUCCESS) {
			eprintf("fail %"PRIx32"\n", ret);
			err_ret = ret;
		}
		finish_one_write(wi, i);
	}
finish_write:
	/*
	 * Loop until no request is outstanding; presumably the
	 * finish_one_write*() helpers decrement wi->nr_sent - TODO confirm.
	 */
	if (wi->nr_sent > 0)
		goto again;

	return err_ret;
}
Example #5
0
File: farm.c Project: yamt/sheepdog
/*
 * Restore object 'oid' from a snapshot.
 *
 * Epochs from 'tgt_epoch' up to (but not including) the current system
 * epoch are probed in ascending order; the first snapshot copy found is
 * stored with farm_atomic_put() using iocb->length as its length.
 *
 * Returns SD_RES_EIO when no snapshot in that range holds the object,
 * otherwise the result of farm_atomic_put(). The buffer obtained from the
 * snapshot is freed before returning.
 */
static int farm_link(uint64_t oid, struct siocb *iocb, uint32_t tgt_epoch)
{
	int ret = SD_RES_EIO;
	void *buf = NULL;
	struct siocb io = { 0 };
	uint32_t i;
	uint32_t epoch = sys_epoch();

	dprintf("try link %"PRIx64" from snapshot with epoch %"PRIu32"\n",
		oid, tgt_epoch);

	/* Unsigned index: avoids a signed/unsigned comparison with 'epoch' */
	for (i = tgt_epoch; i < epoch; i++) {
		buf = retrieve_object_from_snap(oid, i);
		if (buf)
			break;
	}
	if (!buf)
		goto out;

	io.length = iocb->length;
	io.buf = buf;
	ret = farm_atomic_put(oid, &io);
out:
	free(buf);
	return ret;
}
Example #6
0
int snap_file_write(uint32_t epoch, unsigned char *trunksha1, unsigned char *outsha1, int user)
{
	int ret = 0;
	struct strbuf buf = STRBUF_INIT;
	struct sd_node nodes[SD_MAX_NODES];
	int tgt_epoch = user ? sys_epoch() : epoch;
	uint64_t epoch_size;
	struct sha1_file_hdr hdr;

	epoch_size = epoch_log_read(tgt_epoch, (char *)nodes, sizeof(nodes));
	if (epoch_size == -1)
		return -1;

	memcpy(hdr.tag, TAG_SNAP, TAG_LEN);
	hdr.size = epoch_size + SHA1_LEN;
	hdr.priv = tgt_epoch;
	hdr.reserved = 0;

	strbuf_add(&buf, &hdr, sizeof(hdr));
	strbuf_add(&buf, trunksha1, SHA1_LEN);
	strbuf_add(&buf, (char *)nodes, epoch_size);
	if (sha1_file_write((void *)buf.buf, buf.len, outsha1) < 0) {
		ret = -1;
		goto err;
	}

	dprintf("epoch %u, sha1: %s\n", epoch, sha1_to_hex(outsha1));
err:
	strbuf_release(&buf);
	return ret;
}
Example #7
0
/*
 * Write iocb->length bytes from iocb->buf into the object file of 'oid',
 * starting at iocb->offset.
 *
 * The data is first handed to the journal when journaling is enabled; if
 * that fails, journaling is switched off, O_DSYNC is added to the open
 * flags and sync() is called.
 *
 * Returns SD_RES_OLD_NODE_VER when the request epoch is older than the
 * current system epoch, the err_to_sderr() translation of errno when the
 * open or the write fails, and SD_RES_SUCCESS otherwise.
 */
int default_write(uint64_t oid, const struct siocb *iocb)
{
	int flags = get_open_flags(oid, false, iocb->flags), fd,
	    ret = SD_RES_SUCCESS;
	char path[PATH_MAX];
	ssize_t size;

	if (iocb->epoch < sys_epoch()) {
		sd_dprintf("%"PRIu32" sys %"PRIu32"\n",
			iocb->epoch, sys_epoch());
		return SD_RES_OLD_NODE_VER;
	}

	get_obj_path(oid, path);

	if (uatomic_is_true(&sys->use_journal) &&
	    journal_file_write(oid, iocb->buf, iocb->length, iocb->offset,
			       false)
	    != SD_RES_SUCCESS) {
		sd_eprintf("turn off journaling\n");
		uatomic_set_false(&sys->use_journal);
		flags |= O_DSYNC;
		sync();
	}

	fd = open(path, flags, def_fmode);
	if (fd < 0)
		return err_to_sderr(oid, errno);

	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
	if (size != iocb->length) {
		/*
		 * Capture errno before logging: the logging call may
		 * overwrite it before it is translated below.
		 */
		int err = errno;

		sd_eprintf("failed to write object %"PRIx64", path=%s, offset=%"
			PRId64", size=%"PRId32", result=%zd, %m\n", oid, path,
			iocb->offset, iocb->length, size);
		ret = err_to_sderr(oid, err);
	}

	close(fd);
	return ret;
}
Example #8
0
/*
 * Write iocb->length bytes from iocb->buf into the object file of 'oid',
 * starting at iocb->offset.
 *
 * The data is first handed to the journal when journaling is enabled; if
 * that fails, journaling is switched off, O_DSYNC is added to the open
 * flags and sync() is called.
 *
 * Returns SD_RES_OLD_NODE_VER when the request epoch is older than the
 * current system epoch, the err_to_sderr() translation of errno when the
 * open or the write fails, and SD_RES_SUCCESS otherwise.
 */
int default_write(uint64_t oid, const struct siocb *iocb)
{
	int flags = prepare_iocb(oid, iocb, false), fd,
	    ret = SD_RES_SUCCESS;
	char path[PATH_MAX];
	ssize_t size;

	if (iocb->epoch < sys_epoch()) {
		sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
		return SD_RES_OLD_NODE_VER;
	}

	/*
	 * The branch hint wraps the whole comparison, so the test does not
	 * depend on how unlikely() normalizes its argument.
	 */
	if (uatomic_is_true(&sys->use_journal) &&
	    unlikely(journal_write_store(oid, iocb->buf, iocb->length,
					 iocb->offset, false)
		     != SD_RES_SUCCESS)) {
		sd_err("turn off journaling");
		uatomic_set_false(&sys->use_journal);
		flags |= O_DSYNC;
		sync();
	}

	get_obj_path(oid, path);

	fd = open(path, flags, sd_def_fmode);
	if (unlikely(fd < 0))
		return err_to_sderr(path, oid, errno);

	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
	if (unlikely(size != iocb->length)) {
		/*
		 * Capture errno before logging: the logging call may
		 * overwrite it before it is translated below.
		 */
		int err = errno;

		sd_err("failed to write object %"PRIx64", path=%s, offset=%"
		       PRId64", size=%"PRId32", result=%zd, %m", oid, path,
		       iocb->offset, iocb->length, size);
		ret = err_to_sderr(path, oid, err);
	}

	close(fd);
	return ret;
}
Example #9
0
/*
 * Write iocb->length bytes from iocb->buf into the stored object 'oid'
 * (store path selected by iocb->ec_index), starting at iocb->offset.
 *
 * Returns SD_RES_OLD_NODE_VER when the request epoch is older than the
 * current system epoch, the err_to_sderr() translation of errno when the
 * object is not found or the open/write fails, and SD_RES_SUCCESS
 * otherwise.
 */
int default_write(uint64_t oid, const struct siocb *iocb)
{
	int flags = prepare_iocb(oid, iocb, false), fd,
	    ret = SD_RES_SUCCESS;
	char path[PATH_MAX];
	ssize_t size;

	if (iocb->epoch < sys_epoch()) {
		sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch());
		return SD_RES_OLD_NODE_VER;
	}

	get_store_path(oid, iocb->ec_index, path);

	/*
	 * Make sure oid is in the right place because oid might be misplaced
	 * in a wrong place, due to 'shutdown/restart with less/more disks' or
	 * any bugs. We need to call err_to_sderr() to return EIO if the disk
	 * is broken.
	 */
	if (!default_exist(oid, iocb->ec_index))
		return err_to_sderr(path, oid, errno);

	fd = open(path, flags, sd_def_fmode);
	if (unlikely(fd < 0))
		return err_to_sderr(path, oid, errno);

	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
	if (unlikely(size != iocb->length)) {
		/*
		 * Capture errno before logging: the logging call may
		 * overwrite it before it is translated below.
		 */
		int err = errno;

		sd_err("failed to write object %"PRIx64", path=%s, offset=%"
		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
		       iocb->offset, iocb->length, size);
		ret = err_to_sderr(path, oid, err);
	}

	close(fd);
	return ret;
}
Example #10
0
/*
 * Recover object 'oid' for 'epoch' from the replica described by 'vnode'.
 *
 * If the vnode is local and 'tgt_epoch' is older than the current system
 * epoch, the store's link() operation is used. Otherwise the object is
 * fetched with an SD_OP_READ_PEER request and stored through the store's
 * create_and_write() operation. On success the oid is inserted into the
 * object list cache.
 *
 * Returns SD_RES_NO_MEM when the transfer buffer cannot be allocated,
 * otherwise the result of the link, request or write step that ended the
 * recovery.
 */
static int recover_object_from_replica(uint64_t oid,
				       const struct sd_vnode *vnode,
				       uint32_t epoch, uint32_t tgt_epoch)
{
	struct sd_req hdr;
	/* The response is delivered in the request header's storage */
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	struct siocb iocb = { 0 };
	void *data = NULL;
	unsigned objsize;
	int result;

	if (vnode_is_local(vnode) && tgt_epoch < sys_epoch()) {
		result = sd_store->link(oid, tgt_epoch);
		goto done;
	}

	objsize = get_objsize(oid);
	data = valloc(objsize);
	if (data == NULL) {
		sd_eprintf("%m");
		result = SD_RES_NO_MEM;
		goto done;
	}

	sd_init_req(&hdr, SD_OP_READ_PEER);
	hdr.epoch = epoch;
	hdr.flags = SD_FLAG_CMD_RECOVERY;
	hdr.data_length = objsize;
	hdr.obj.oid = oid;
	hdr.obj.tgt_epoch = tgt_epoch;

	result = sheep_exec_req(&vnode->nid, &hdr, data);
	if (result == SD_RES_SUCCESS) {
		/* Store exactly what the peer reported sending */
		iocb.epoch = epoch;
		iocb.length = rsp->data_length;
		iocb.offset = rsp->obj.offset;
		iocb.buf = data;
		result = sd_store->create_and_write(oid, &iocb);
	}

done:
	if (result == SD_RES_SUCCESS) {
		sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid,
			tgt_epoch, epoch);
		objlist_cache_insert(oid);
	}
	free(data);
	return result;
}
Example #11
0
/*
 * Read object 'oid' as described by 'iocb'.
 *
 * The regular object path is tried first. While the result is
 * SD_RES_NO_OBJ and the request epoch is older than the epoch being probed,
 * the stale paths of successively older epochs are tried, starting just
 * below the system epoch sampled on entry.
 *
 * Returns the result of the last default_read_from_path() call.
 */
int default_read(uint64_t oid, struct siocb *iocb)
{
	uint32_t stale_epoch = sys_epoch();
	char path[PATH_MAX];
	int ret;

	get_obj_path(oid, path);
	ret = default_read_from_path(oid, path, iocb);

	for (;;) {
		if (ret != SD_RES_NO_OBJ)
			break;
		if (iocb->epoch >= stale_epoch)
			break;

		/* The request is against an older epoch: step back once */
		stale_epoch--;
		get_stale_obj_path(oid, stale_epoch, path);
		ret = default_read_from_path(oid, path, iocb);
	}

	return ret;
}
Example #12
0
/*
 * Read object 'oid' as described by 'iocb'.
 *
 * The store path for iocb->ec_index is tried first. If the object is not
 * there (SD_RES_NO_OBJ) and the request carries a non-zero epoch older than
 * the current system epoch, the stale path of that request epoch is tried
 * once.
 *
 * Returns the result of the last default_read_from_path() call.
 */
int default_read(uint64_t oid, const struct siocb *iocb)
{
	char path[PATH_MAX];
	int ret;

	get_store_path(oid, iocb->ec_index, path);
	ret = default_read_from_path(oid, path, iocb);
	if (ret != SD_RES_NO_OBJ)
		return ret;

	/* The request is against an older epoch: look in the stale directory */
	if (iocb->epoch > 0 && iocb->epoch < sys_epoch()) {
		get_store_stale_path(oid, iocb->epoch, iocb->ec_index, path);
		ret = default_read_from_path(oid, path, iocb);
	}

	return ret;
}
Example #13
0
/*
 * Fetch the object list of target epoch 'epoch - 1' from node 'e'.
 *
 * An SD_OP_GET_OBJ_LIST request is sent; whenever the node answers
 * SD_RES_BUFFER_SMALL the receive buffer is doubled and the request is
 * reissued.
 *
 * On success the number of received object ids is stored in '*nr_oids' and
 * the buffer holding them is returned; the caller owns it. On any other
 * result the buffer is freed and NULL is returned.
 */
static uint64_t *fetch_object_list(struct sd_node *e, uint32_t epoch,
				   size_t *nr_oids)
{
	char name[128];
	struct sd_list_req hdr;
	/* The response is delivered in the request header's storage */
	struct sd_list_rsp *rsp = (struct sd_list_rsp *)&hdr;
	size_t capacity = list_buffer_size;
	uint64_t *oids = xmalloc(capacity);
	int ret;

	addr_to_str(name, sizeof(name), e->nid.addr, 0);
	sd_dprintf("%s %"PRIu32, name, e->nid.port);

	for (;;) {
		/* The header is rebuilt for every attempt */
		sd_init_req((struct sd_req *)&hdr, SD_OP_GET_OBJ_LIST);
		hdr.tgt_epoch = epoch - 1;
		hdr.data_length = capacity;
		hdr.epoch = sys_epoch();
		ret = sheep_exec_req(&e->nid, (struct sd_req *)&hdr, oids);
		if (ret != SD_RES_BUFFER_SMALL)
			break;

		capacity *= 2;
		oids = xrealloc(oids, capacity);
	}

	if (ret != SD_RES_SUCCESS) {
		free(oids);
		return NULL;
	}

	*nr_oids = rsp->data_length / sizeof(uint64_t);
	sd_dprintf("%zu", *nr_oids);
	return oids;
}
Example #14
0
/* Return true when 'epoch' equals the current system epoch. */
bool sheep_need_retry(uint32_t epoch)
{
	const bool epoch_unchanged = (sys_epoch() == epoch);

	return epoch_unchanged;
}
Example #15
0
File: farm.c Project: yamt/sheepdog
/*
 * Read iocb->length bytes of object 'oid' into iocb->buf.
 *
 * Requests older than the current system epoch are served from the working
 * object or, failing that, from the first snapshot found between the
 * request epoch and the current epoch. Current requests read the object
 * file at iocb->offset while holding a shared flock() on it.
 *
 * Returns SD_RES_NO_OBJ when an old-epoch object cannot be found, the
 * err_to_sderr() translation of errno when open() fails, SD_RES_EIO on
 * lock, unlock or short-read failure, and SD_RES_SUCCESS otherwise.
 */
static int farm_read(uint64_t oid, struct siocb *iocb)
{
	int flags = def_open_flags, fd, ret = SD_RES_SUCCESS;
	uint32_t epoch = sys_epoch();
	char path[PATH_MAX];
	ssize_t size;
	uint32_t i;
	void *buffer;

	if (iocb->epoch < epoch) {

		buffer = read_working_object(oid, iocb->offset, iocb->length);
		if (!buffer) {
			/*
			 * If reading the object from the targeted epoch
			 * failed, we need to read from a later epoch. At some
			 * epochs the object is not written to the snapshot
			 * because it is assumed to be in the current local
			 * object directory, but it may have been removed from
			 * that directory in a following epoch. In that case we
			 * retrieve the object upwards, since an object that is
			 * about to be removed gets written to the snapshot of
			 * a later epoch.
			 */
			/* Unsigned index: matches the type of 'epoch' */
			for (i = iocb->epoch; i < epoch; i++) {
				buffer = retrieve_object_from_snap(oid, i);
				if (buffer)
					break;
			}
		}
		if (!buffer)
			return SD_RES_NO_OBJ;
		memcpy(iocb->buf, buffer, iocb->length);
		free(buffer);

		return SD_RES_SUCCESS;
	}

	if (!is_data_obj(oid))
		flags &= ~O_DIRECT;

	/* Bounded formatting: never write past the end of path[] */
	snprintf(path, sizeof(path), "%s%016"PRIx64, obj_path, oid);
	fd = open(path, flags);

	if (fd < 0)
		return err_to_sderr(oid, errno);

	if (flock(fd, LOCK_SH) < 0) {
		ret = SD_RES_EIO;
		eprintf("%m\n");
		goto out;
	}
	size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
	if (flock(fd, LOCK_UN) < 0) {
		ret = SD_RES_EIO;
		eprintf("%m\n");
		goto out;
	}
	if (size != iocb->length)
		ret = SD_RES_EIO;
out:
	close(fd);
	return ret;
}