Пример #1
0
int default_create_and_write(uint64_t oid, struct siocb *iocb)
{
	char path[PATH_MAX], tmp_path[PATH_MAX];
	int flags = get_open_flags(oid, true);
	int ret, fd;
	uint32_t len = iocb->length;

	get_obj_path(oid, path);
	get_tmp_obj_path(oid, tmp_path);

	fd = open(tmp_path, flags, def_fmode);
	if (fd < 0) {
		if (errno == EEXIST)
			/* This happens if node membership changes during object
			 * creation; while gateway retries a CREATE request,
			 * recovery process could also recover the object at the
			 * same time.  They should try to write the same date,
			 * so it is okay to simply return success here. */
			dprintf("%s exists\n", tmp_path);
			return SD_RES_SUCCESS;

		eprintf("failed to open %s: %m\n", tmp_path);
		return err_to_sderr(oid, errno);
	}

	if (iocb->offset != 0 || iocb->length != get_objsize(oid)) {
		ret = prealloc(fd, get_objsize(oid));
		if (ret != SD_RES_SUCCESS)
			goto out;
	}

	ret = xpwrite(fd, iocb->buf, len, iocb->offset);
	if (ret != len) {
		eprintf("failed to write object. %m\n");
		ret = err_to_sderr(oid, errno);
		goto out;
	}

	ret = rename(tmp_path, path);
	if (ret < 0) {
		eprintf("failed to rename %s to %s: %m\n", tmp_path, path);
		ret = err_to_sderr(oid, errno);
		goto out;
	}
	dprintf("%"PRIx64"\n", oid);
	ret = SD_RES_SUCCESS;
out:
	if (ret != SD_RES_SUCCESS)
		unlink(tmp_path);
	close(fd);
	return ret;
}
Пример #2
0
/* Trim zero blocks of the beginning and end of the object. */
static int default_trim(int fd, uint64_t oid, const struct siocb *iocb,
			uint64_t *poffset, uint32_t *plen)
{
	trim_zero_blocks(iocb->buf, poffset, plen);

	if (iocb->offset < *poffset) {
		sd_debug("discard between %d, %ld, %" PRIx64, iocb->offset,
			 *poffset, oid);

		if (discard(fd, iocb->offset, *poffset) < 0)
			return -1;
	}

	if (*poffset + *plen < iocb->offset + iocb->length) {
		uint64_t end = iocb->offset + iocb->length;
		if (end == get_objsize(oid))
			/* This is necessary to punch the last block */
			end = round_up(end, BLOCK_SIZE);
		sd_debug("discard between %ld, %ld, %" PRIx64, *poffset + *plen,
			 end, oid);

		if (discard(fd, *poffset + *plen, end) < 0)
			return -1;
	}

	return 0;
}
Пример #3
0
size_t get_store_objsize(uint64_t oid)
{
	if (is_erasure_oid(oid)) {
		uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
		int d;
		ec_policy_to_dp(policy, &d, NULL);
		return SD_DATA_OBJ_SIZE / d;
	}
	return get_objsize(oid);
}
Пример #4
0
static int farm_write(uint64_t oid, struct siocb *iocb, int create)
{
	int flags = def_open_flags, fd, ret = SD_RES_SUCCESS;
	char path[PATH_MAX];
	ssize_t size;

	if (iocb->epoch < sys_epoch()) {
		dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch());
		return SD_RES_OLD_NODE_VER;
	}
	if (!is_data_obj(oid))
		flags &= ~O_DIRECT;

	if (create)
		flags |= O_CREAT | O_TRUNC;

	sprintf(path, "%s%016"PRIx64, obj_path, oid);
	fd = open(path, flags, def_fmode);
	if (fd < 0)
		return err_to_sderr(oid, errno);

	if (flock(fd, LOCK_EX) < 0) {
		ret = SD_RES_EIO;
		eprintf("%m\n");
		goto out;
	}
	if (create && !(iocb->flags & SD_FLAG_CMD_COW)) {
		ret = prealloc(fd, get_objsize(oid));
		if (ret != SD_RES_SUCCESS) {
			if (flock(fd, LOCK_UN) < 0) {
				ret = SD_RES_EIO;
				eprintf("%m\n");
				goto out;
			}
			goto out;
		}
	}
	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
	if (flock(fd, LOCK_UN) < 0) {
		ret = SD_RES_EIO;
		eprintf("%m\n");
		goto out;
	}
	if (size != iocb->length) {
		eprintf("%m\n");
		ret = SD_RES_EIO;
		goto out;
	}

	trunk_update_entry(oid);
out:
	close(fd);
	return ret;
}
Пример #5
0
size_t get_store_objsize(uint8_t copy_policy, uint64_t oid)
{
	if (is_vdi_obj(oid))
		return SD_INODE_SIZE;
	if (copy_policy != 0) {
		int d;

		ec_policy_to_dp(copy_policy, &d, NULL);
		return SD_DATA_OBJ_SIZE / d;
	}
	return get_objsize(oid);
}
Пример #6
0
static int recover_object_from_replica(uint64_t oid,
				       const struct sd_vnode *vnode,
				       uint32_t epoch, uint32_t tgt_epoch)
{
	struct sd_req hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	unsigned rlen;
	int ret = SD_RES_NO_MEM;
	void *buf = NULL;
	struct siocb iocb = { 0 };

	if (vnode_is_local(vnode)) {
		ret = sd_store->link(oid, tgt_epoch);
		goto out;
	}

	rlen = get_objsize(oid);
	buf = valloc(rlen);
	if (!buf) {
		sd_eprintf("%m");
		goto out;
	}

	sd_init_req(&hdr, SD_OP_READ_PEER);
	hdr.epoch = epoch;
	hdr.flags = SD_FLAG_CMD_RECOVERY;
	hdr.data_length = rlen;
	hdr.obj.oid = oid;
	hdr.obj.tgt_epoch = tgt_epoch;

	ret = sheep_exec_req(&vnode->nid, &hdr, buf);
	if (ret != SD_RES_SUCCESS)
		goto out;
	iocb.epoch = epoch;
	iocb.length = rsp->data_length;
	iocb.offset = rsp->obj.offset;
	iocb.buf = buf;
	ret = sd_store->create_and_write(oid, &iocb);
out:
	if (ret == SD_RES_SUCCESS) {
		sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid,
			tgt_epoch, epoch);
		objlist_cache_insert(oid);
	}
	free(buf);
	return ret;
}
Пример #7
0
int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1)
{
	int ret;
	void *buf;
	struct siocb iocb = {};
	uint32_t length;
	bool is_readonly_obj = oid_is_readonly(oid);
	char path[PATH_MAX];

	ret = get_object_path(oid, epoch, path);
	if (ret != SD_RES_SUCCESS)
		return ret;

	if (is_readonly_obj) {
		if (get_object_sha1(path, sha1) == 0) {
			sd_debug("use cached sha1 digest %s",
				 sha1_to_hex(sha1));
			return SD_RES_SUCCESS;
		}
	}

	length = get_objsize(oid);
	buf = valloc(length);
	if (buf == NULL)
		return SD_RES_NO_MEM;

	iocb.epoch = epoch;
	iocb.buf = buf;
	iocb.length = length;

	ret = default_read_from_path(oid, path, &iocb);
	if (ret != SD_RES_SUCCESS) {
		free(buf);
		return ret;
	}

	sha1_from_buffer(buf, length, sha1);
	free(buf);

	sd_debug("the message digest of %"PRIx64" at epoch %d is %s", oid,
		 epoch, sha1_to_hex(sha1));

	if (is_readonly_obj)
		set_object_sha1(path, sha1);

	return ret;
}
Пример #8
0
size_t get_store_objsize(uint8_t copy_policy, uint8_t block_size_shift,
			 uint64_t oid)
{
	if (is_vdi_obj(oid))
		return SD_INODE_SIZE;
	if (is_vdi_btree_obj(oid))
		return SD_INODE_DATA_INDEX_SIZE;

	uint32_t object_size = (UINT32_C(1) << block_size_shift);
	if (copy_policy != 0) {
		int d;

		ec_policy_to_dp(copy_policy, &d, NULL);
		return object_size / d;
	}
	return get_objsize(oid, object_size);
}
Пример #9
0
int default_create_and_write(uint64_t oid, const struct siocb *iocb)
{
	char path[PATH_MAX], tmp_path[PATH_MAX];
	int flags = prepare_iocb(oid, iocb, true);
	int ret, fd;
	uint32_t len = iocb->length;
	size_t obj_size;
	uint64_t offset = iocb->offset;

	sd_debug("%"PRIx64, oid);
	get_store_path(oid, iocb->ec_index, path);
	get_store_tmp_path(oid, iocb->ec_index, tmp_path);

	if (uatomic_is_true(&sys->use_journal) &&
	    journal_write_store(oid, iocb->buf, iocb->length,
				iocb->offset, true)
	    != SD_RES_SUCCESS) {
		sd_err("turn off journaling");
		uatomic_set_false(&sys->use_journal);
		flags |= O_DSYNC;
		sync();
	}

	fd = open(tmp_path, flags, sd_def_fmode);
	if (fd < 0) {
		if (errno == EEXIST) {
			/*
			 * This happens if node membership changes during object
			 * creation; while gateway retries a CREATE request,
			 * recovery process could also recover the object at the
			 * same time.  They should try to write the same date,
			 * so it is okay to simply return success here.
			 */
			sd_debug("%s exists", tmp_path);
			return SD_RES_SUCCESS;
		}

		sd_err("failed to open %s: %m", tmp_path);
		return err_to_sderr(path, oid, errno);
	}

	obj_size = get_store_objsize(oid);

	trim_zero_blocks(iocb->buf, &offset, &len);

	if (offset != 0 || len != get_objsize(oid)) {
		if (is_sparse_object(oid))
			ret = xftruncate(fd, obj_size);
		else
			ret = prealloc(fd, obj_size);
		if (ret < 0) {
			ret = err_to_sderr(path, oid, errno);
			goto out;
		}
	}

	ret = xpwrite(fd, iocb->buf, len, offset);
	if (ret != len) {
		sd_err("failed to write object. %m");
		ret = err_to_sderr(path, oid, errno);
		goto out;
	}

	ret = rename(tmp_path, path);
	if (ret < 0) {
		sd_err("failed to rename %s to %s: %m", tmp_path, path);
		ret = err_to_sderr(path, oid, errno);
		goto out;
	}

	ret = SD_RES_SUCCESS;
	objlist_cache_insert(oid);
out:
	if (ret != SD_RES_SUCCESS)
		unlink(tmp_path);
	close(fd);
	return ret;
}
Пример #10
0
static int recover_object_from_replica(uint64_t oid,
				       struct sd_vnode *entry,
				       uint32_t epoch, uint32_t tgt_epoch)
{
	struct sd_req hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	char name[128];
	unsigned wlen = 0, rlen;
	int fd, ret = -1;
	void *buf;
	struct siocb iocb = { 0 };

	rlen = get_objsize(oid);

	buf = valloc(rlen);
	if (!buf) {
		eprintf("%m\n");
		goto out;
	}

	if (vnode_is_local(entry)) {
		iocb.epoch = epoch;
		iocb.length = rlen;
		ret = sd_store->link(oid, &iocb, tgt_epoch);
		if (ret == SD_RES_SUCCESS) {
			ret = 0;
			goto done;
		} else {
			ret = -1;
			goto out;
		}
	}

	addr_to_str(name, sizeof(name), entry->nid.addr, 0);
	fd = connect_to(name, entry->nid.port);
	dprintf("%s, %d\n", name, entry->nid.port);
	if (fd < 0) {
		eprintf("failed to connect to %s:%"PRIu32"\n", name, entry->nid.port);
		ret = -1;
		goto out;
	}

	sd_init_req(&hdr, SD_OP_READ_PEER);
	hdr.epoch = epoch;
	hdr.flags = SD_FLAG_CMD_RECOVERY;
	hdr.data_length = rlen;

	hdr.obj.oid = oid;
	hdr.obj.tgt_epoch = tgt_epoch;

	ret = exec_req(fd, &hdr, buf, &wlen, &rlen);

	close(fd);

	if (ret != 0) {
		eprintf("res: %"PRIx32"\n", rsp->result);
		ret = -1;
		goto out;
	}

	rsp = (struct sd_rsp *)&hdr;

	if (rsp->result == SD_RES_SUCCESS) {
		iocb.epoch = epoch;
		iocb.length = rlen;
		iocb.buf = buf;
		ret = sd_store->atomic_put(oid, &iocb);
		if (ret != SD_RES_SUCCESS) {
			ret = -1;
			goto out;
		}
	} else {
		eprintf("failed, res: %"PRIx32"\n", rsp->result);
		ret = rsp->result;
		goto out;
	}
done:
	dprintf("recovered oid %"PRIx64" from %d to epoch %d\n", oid, tgt_epoch, epoch);
out:
	if (ret == SD_RES_SUCCESS)
		objlist_cache_insert(oid);
	free(buf);
	return ret;
}
Пример #11
0
int default_create_and_write(uint64_t oid, const struct siocb *iocb)
{
	char path[PATH_MAX], tmp_path[PATH_MAX];
	int flags = get_open_flags(oid, true, iocb->flags);
	int ret, fd;
	uint32_t len = iocb->length;

	get_obj_path(oid, path);
	get_tmp_obj_path(oid, tmp_path);

	if (uatomic_is_true(&sys->use_journal) &&
	    journal_file_write(oid, iocb->buf, iocb->length, iocb->offset, true)
	    != SD_RES_SUCCESS) {
		sd_eprintf("turn off journaling");
		uatomic_set_false(&sys->use_journal);
		flags |= O_DSYNC;
		sync();
	}

	fd = open(tmp_path, flags, def_fmode);
	if (fd < 0) {
		if (errno == EEXIST) {
			/*
			 * This happens if node membership changes during object
			 * creation; while gateway retries a CREATE request,
			 * recovery process could also recover the object at the
			 * same time.  They should try to write the same date,
			 * so it is okay to simply return success here.
			 */
			sd_dprintf("%s exists", tmp_path);
			return SD_RES_SUCCESS;
		}

		sd_eprintf("failed to open %s: %m", tmp_path);
		return err_to_sderr(oid, errno);
	}

	if (iocb->offset != 0 || iocb->length != get_objsize(oid)) {
		ret = prealloc(fd, get_objsize(oid));
		if (ret < 0) {
			ret = err_to_sderr(oid, errno);
			goto out;
		}
	}

	ret = xpwrite(fd, iocb->buf, len, iocb->offset);
	if (ret != len) {
		sd_eprintf("failed to write object. %m");
		ret = err_to_sderr(oid, errno);
		goto out;
	}

	ret = rename(tmp_path, path);
	if (ret < 0) {
		sd_eprintf("failed to rename %s to %s: %m", tmp_path, path);
		ret = err_to_sderr(oid, errno);
		goto out;
	}
	sd_dprintf("%"PRIx64, oid);
	ret = SD_RES_SUCCESS;
out:
	if (ret != SD_RES_SUCCESS)
		unlink(tmp_path);
	close(fd);
	return ret;
}
Пример #12
0
			sd_debug("%s exists", tmp_path);
			return SD_RES_SUCCESS;
		}

		sd_err("failed to open %s: %m", tmp_path);
		return err_to_sderr(path, oid, errno);
	}

	if (ec) {
		uint8_t policy = iocb->copy_policy ?:
			get_vdi_copy_policy(oid_to_vid(oid));
		int d;
		ec_policy_to_dp(policy, &d, NULL);
		obj_size = SD_DATA_OBJ_SIZE / d;
	} else
		obj_size = get_objsize(oid);

	ret = prealloc(fd, obj_size);
	if (ret < 0) {
		ret = err_to_sderr(path, oid, errno);
		goto out;
	}

	ret = xpwrite(fd, iocb->buf, len, iocb->offset);
	if (ret != len) {
		sd_err("failed to write object. %m");
		ret = err_to_sderr(path, oid, errno);
		goto out;
	}

	ret = rename(tmp_path, path);