예제 #1
0
파일: plain_store.c 프로젝트: DLag/sheepdog
/*
 * Read the inode header of VDI object @oid for @epoch and register the VDI.
 *
 * On a successful read the copy count, snapshot flag and copy policy found
 * in the header are passed to add_vdi_state(), and the VDI id is marked in
 * sys->vdi_inuse.
 *
 * @wd is only used in the error message.
 *
 * Returns SD_RES_SUCCESS, or the error code returned by default_read().
 */
static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
{
	int ret;
	struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE);
	struct siocb iocb = {
		.epoch = epoch,
		.buf = inode,
		.length = SD_INODE_HEADER_SIZE,
	};

	ret = default_read(oid, &iocb);
	if (ret != SD_RES_SUCCESS) {
		/* epoch is unsigned; keep a space before "at" */
		sd_err("failed to read inode header %" PRIx64 " %" PRIu32
		       " at %s", oid, epoch, wd);
		goto out;
	}

	add_vdi_state(oid_to_vid(oid), inode->nr_copies,
		      vdi_is_snapshot(inode), inode->copy_policy);
	atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);

	/* ret is already SD_RES_SUCCESS here */
out:
	free(inode);
	return ret;
}
예제 #2
0
파일: farm.c 프로젝트: yaekumo/sheepdog
/*
 * Walk the directory @path and flag every VDI object found there.
 *
 * Each entry name is parsed as a hexadecimal object id.  Names that parse
 * to 0 or ULLONG_MAX, and ids that are not VDI objects, are ignored.  For
 * the rest, the matching VDI id is set in sys->vdi_inuse.
 *
 * Returns 0 on success, -1 if the directory cannot be opened.
 */
static int init_sys_vdi_bitmap(char *path)
{
    struct dirent *entry;
    DIR *wd = opendir(path);

    if (wd == NULL) {
        vprintf(SDOG_ERR, "failed to open the working directory: %m\n");
        return -1;
    }

    vprintf(SDOG_INFO, "found the working directory %s\n", path);

    for (entry = readdir(wd); entry != NULL; entry = readdir(wd)) {
        uint64_t oid;

        if (strcmp(entry->d_name, ".") == 0)
            continue;

        oid = strtoull(entry->d_name, NULL, 16);
        if (oid == 0 || oid == ULLONG_MAX || !is_vdi_obj(oid))
            continue;

        vprintf(SDOG_DEBUG, "found the VDI object %" PRIx64 "\n", oid);

        set_bit(oid_to_vid(oid), sys->vdi_inuse);
    }

    closedir(wd);
    return 0;
}
예제 #3
0
/*
 * Load the inode header of VDI object @oid from the object directory and
 * record its number of copies via add_vdi_copy_number().
 *
 * Returns SD_RES_SUCCESS on success, or SD_RES_EIO when the object file
 * cannot be opened or the header cannot be read in full.
 */
static int init_vdi_copy_number(uint64_t oid)
{
	char path[PATH_MAX];
	int fd, flags = get_open_flags(oid, false), ret;
	struct sheepdog_inode *inode = xzalloc(sizeof(*inode));

	snprintf(path, sizeof(path), "%s%016" PRIx64, obj_path, oid);

	fd = open(path, flags);
	if (fd < 0) {
		eprintf("failed to open %s, %m\n", path);
		ret = SD_RES_EIO;
		goto out;
	}

	/* Only the header is needed to learn nr_copies. */
	ret = xpread(fd, inode, SD_INODE_HEADER_SIZE, 0);
	if (ret != SD_INODE_HEADER_SIZE) {
		eprintf("failed to read inode header, path=%s, %m\n", path);
		ret = SD_RES_EIO;
		goto out_close;
	}

	add_vdi_copy_number(oid_to_vid(oid), inode->nr_copies);

	ret = SD_RES_SUCCESS;
out_close:
	/* The descriptor was previously leaked on every call. */
	close(fd);
out:
	free(inode);
	/* Propagate the failure instead of unconditionally reporting success. */
	return ret;
}
예제 #4
0
/*
 * Trim zero blocks of the beginning and end of the object.
 *
 * @poffset and @plen are handed to trim_zero_blocks() together with the
 * request buffer; afterwards any range of the original request
 * [iocb->offset, iocb->offset + iocb->length) that lies outside
 * [*poffset, *poffset + *plen) is discarded from @fd.
 *
 * Returns 0 on success, -1 if discard() fails.
 */
static int default_trim(int fd, uint64_t oid, const struct siocb *iocb,
			uint64_t *poffset, uint32_t *plen)
{
	trim_zero_blocks(iocb->buf, poffset, plen);

	if (iocb->offset < *poffset) {
		/* 64-bit values must not be printed with %d/%ld */
		sd_debug("discard between %" PRIu64 ", %" PRIu64 ", %016" PRIx64,
			 (uint64_t)iocb->offset, *poffset, oid);

		if (discard(fd, iocb->offset, *poffset) < 0)
			return -1;
	}

	if (*poffset + *plen < iocb->offset + iocb->length) {
		uint64_t end = iocb->offset + iocb->length;
		uint32_t object_size = get_vdi_object_size(oid_to_vid(oid));
		if (end == get_objsize(oid, object_size))
			/* This is necessary to punch the last block */
			end = round_up(end, BLOCK_SIZE);
		sd_debug("discard between %" PRIu64 ", %" PRIu64 ", %016" PRIx64,
			 *poffset + *plen, end, oid);

		if (discard(fd, *poffset + *plen, end) < 0)
			return -1;
	}

	return 0;
}
예제 #5
0
/*
 * Read the inode header of VDI object @oid and mark the VDI as in use.
 *
 * The object is read from the regular store path when @epoch is 0 and from
 * the stale path of that epoch otherwise.  @wd is not used.
 *
 * Returns SD_RES_SUCCESS, or the error code returned by
 * default_read_from_path().
 */
static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
{
	int ret;
	struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE);
	struct siocb iocb = {
		.epoch = epoch,
		.buf = inode,
		.length = SD_INODE_HEADER_SIZE,
	};
	char path[PATH_MAX];

	/* iocb.ec_index is zero: members not named above are zero-initialized */
	if (epoch == 0)
		get_store_path(oid, iocb.ec_index, path);
	else
		get_store_stale_path(oid, iocb.epoch, iocb.ec_index, path);

	ret = default_read_from_path(oid, path, &iocb);
	if (ret != SD_RES_SUCCESS) {
		/* epoch is unsigned; keep a space before "at" */
		sd_err("failed to read inode header %" PRIx64 " %" PRIu32
		       " at %s", oid, epoch, path);
		goto out;
	}
	atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);
out:
	free(inode);
	return ret;
}
예제 #6
0
파일: plain_store.c 프로젝트: DLag/sheepdog
/*
 * Return the size used to store object @oid.
 *
 * For erasure-coded objects this is SD_DATA_OBJ_SIZE divided by the data
 * strip count that ec_policy_to_dp() derives from the VDI copy policy; for
 * everything else it is get_objsize().
 */
size_t get_store_objsize(uint64_t oid)
{
	uint8_t policy;
	int d;

	if (!is_erasure_oid(oid))
		return get_objsize(oid);

	policy = get_vdi_copy_policy(oid_to_vid(oid));
	ec_policy_to_dp(policy, &d, NULL);

	return SD_DATA_OBJ_SIZE / d;
}
예제 #7
0
/*
 * Per-object callback: add @oid to the object list cache and, when it is a
 * VDI object, mark the VDI in sys->vdi_inuse and load its copy number.
 *
 * @arg is unused.  Returns SD_RES_SUCCESS, or the error reported by
 * init_vdi_copy_number().
 */
static int init_objlist_and_vdi_bitmap(uint64_t oid, void *arg)
{
	objlist_cache_insert(oid);

	if (!is_vdi_obj(oid))
		return SD_RES_SUCCESS;

	vprintf(SDOG_DEBUG, "found the VDI object %" PRIx64 "\n", oid);
	set_bit(oid_to_vid(oid), sys->vdi_inuse);

	return init_vdi_copy_number(oid);
}
예제 #8
0
/*
 * Per-object callback: add @oid to the object list cache and, when it is a
 * VDI object, mark the VDI in sys->vdi_inuse.
 *
 * @wd and @epoch are only used for the debug message; @ec_index, @vinfo
 * and @arg are unused.  Always returns SD_RES_SUCCESS.
 */
static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd,
				       uint32_t epoch, uint8_t ec_index,
				       struct vnode_info *vinfo,
				       void *arg)
{
	objlist_cache_insert(oid);

	if (!is_vdi_obj(oid))
		return SD_RES_SUCCESS;

	sd_debug("found the VDI object %" PRIx64" epoch %"PRIu32
		 " at %s", oid, epoch, wd);
	atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);

	return SD_RES_SUCCESS;
}
예제 #9
0
/*
 * Create object @oid and write the data described by @iocb.
 *
 * NOTE(review): this excerpt is truncated (it ends on a dangling "else"),
 * so only the part visible here is documented: journaling the write,
 * opening the temporary object file and computing the per-object size for
 * erasure-coded objects.
 */
int default_create_and_write(uint64_t oid, const struct siocb *iocb)
{
	char path[PATH_MAX], tmp_path[PATH_MAX];
	int flags = prepare_iocb(oid, iocb, true);
	int ret, fd;
	uint32_t len = iocb->length;
	bool ec = is_erasure_obj(oid, iocb->copy_policy);
	size_t obj_size;

	sd_debug("%"PRIx64, oid);
	get_obj_path(oid, path, sizeof(path));
	get_tmp_obj_path(oid, tmp_path, sizeof(tmp_path));

	/*
	 * Journal the write when journaling is enabled.  If that fails,
	 * disable journaling, switch this write to O_DSYNC and sync().
	 */
	if (uatomic_is_true(&sys->use_journal) &&
	    journal_write_store(oid, iocb->buf, iocb->length,
				iocb->offset, true)
	    != SD_RES_SUCCESS) {
		sd_err("turn off journaling");
		uatomic_set_false(&sys->use_journal);
		flags |= O_DSYNC;
		sync();
	}

	fd = open(tmp_path, flags, sd_def_fmode);
	if (fd < 0) {
		if (errno == EEXIST) {
			/*
			 * This happens if node membership changes during object
			 * creation; while gateway retries a CREATE request,
			 * recovery process could also recover the object at the
			 * same time.  They should try to write the same data,
			 * so it is okay to simply return success here.
			 */
			sd_debug("%s exists", tmp_path);
			return SD_RES_SUCCESS;
		}

		/*
		 * NOTE(review): errno is read after sd_err(); confirm the
		 * logger cannot clobber it.
		 */
		sd_err("failed to open %s: %m", tmp_path);
		return err_to_sderr(path, oid, errno);
	}

	if (ec) {
		/* Prefer the policy carried by the request, else the VDI's. */
		uint8_t policy = iocb->copy_policy ?:
			get_vdi_copy_policy(oid_to_vid(oid));
		int d;
		ec_policy_to_dp(policy, &d, NULL);
		obj_size = SD_DATA_OBJ_SIZE / d;
	} else
예제 #10
0
파일: store.c 프로젝트: zplambert/sheepdog
int remove_object(uint64_t oid)
{
	struct sd_req hdr;
	int ret;

	sd_init_req(&hdr, SD_OP_REMOVE_OBJ);
	hdr.obj.oid = oid;
	hdr.obj.copies = get_vdi_copy_number(oid_to_vid(oid));

	ret = exec_local_req(&hdr, NULL);
	if (ret != SD_RES_SUCCESS)
		sd_eprintf("failed to remove object %" PRIx64 ", %s", oid,
			   sd_strerror(ret));

	return ret;
}
예제 #11
0
파일: gateway.c 프로젝트: kentwei/sheepdog
/*
 * Post-process the per-replica requests @reqs (@nr_to_send of them) that
 * were issued on behalf of @req.
 *
 * Nothing is done for objects that are not erasure coded.  For a read, the
 * strips held in @reqs are interleaved back into whole stripes and the
 * requested byte range is copied into req->data.
 *
 * NOTE(review): this excerpt is truncated; the "out" label and the rest of
 * the function are not visible here.
 */
static void finish_requests(struct request *req, struct req_iter *reqs,
			    int nr_to_send)
{
	uint64_t oid = req->rq.obj.oid;
	uint32_t len = req->rq.data_length;
	uint64_t off = req->rq.obj.offset;
	int opcode = req->rq.opcode;
	/* Index of the first stripe touched and one past the last one. */
	int start = off / SD_EC_DATA_STRIPE_SIZE;
	int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j;
	int nr_stripe = end - start;

	if (!is_erasure_oid(oid))
		goto out;

	sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
		 start, end, nr_to_send, off, len);

	/* We need to assemble the data strips into the req buffer for read */
	if (opcode == SD_OP_READ_OBJ) {
		char *p, *buf;
		/* Prefer the policy carried by the request, else the VDI's. */
		uint8_t policy = req->rq.obj.copy_policy ?:
			get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));
		int ed = 0, strip_size;

		/*
		 * NOTE(review): on allocation failure this jumps to "out"
		 * without filling req->data or setting rp.data_length;
		 * confirm the caller can detect that.
		 */
		buf = malloc(SD_EC_DATA_STRIPE_SIZE * nr_stripe);
		if(unlikely(!buf)) {
			goto out;
		}

		ec_policy_to_dp(policy, &ed, NULL);
		strip_size = SD_EC_DATA_STRIPE_SIZE / ed;

		/* Stripe i is the concatenation of strip i of each request. */
		p = buf;
		for (i = 0; i < nr_stripe; i++) {
			for (j = 0; j < nr_to_send; j++) {
				memcpy(p, reqs[j].buf + strip_size * i,
				       strip_size);
				p += strip_size;
			}
		}
		/* buf starts at a stripe boundary; skip to the requested offset. */
		memcpy(req->data, buf + off % SD_EC_DATA_STRIPE_SIZE, len);
		req->rp.data_length = req->rq.data_length;
		free(buf);
	}
예제 #12
0
파일: store.c 프로젝트: zplambert/sheepdog
int read_backend_object(uint64_t oid, char *data, unsigned int datalen,
			uint64_t offset)
{
	struct sd_req hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	int ret;

	sd_init_req(&hdr, SD_OP_READ_OBJ);
	hdr.data_length = datalen;
	hdr.obj.oid = oid;
	hdr.obj.offset = offset;
	hdr.obj.copies = get_vdi_copy_number(oid_to_vid(oid));

	ret = exec_local_req(&hdr, data);
	if (ret != SD_RES_SUCCESS)
		sd_eprintf("failed to read object %" PRIx64 ", %s", oid,
			   sd_strerror(ret));

	untrim_zero_blocks(data, rsp->obj.offset, rsp->data_length, datalen);

	return ret;
}
예제 #13
0
파일: store.c 프로젝트: zplambert/sheepdog
/* Write data to both local object cache (if enabled) and backends */
int write_object(uint64_t oid, char *data, unsigned int datalen,
		 uint64_t offset, bool create)
{
	struct sd_req hdr;
	int ret;

	if (sys->enable_object_cache && object_is_cached(oid)) {
		ret = object_cache_write(oid, data, datalen, offset,
					 create);
		if (ret == SD_RES_NO_CACHE)
			goto forward_write;

		if (ret != 0) {
			sd_eprintf("write cache failed %"PRIx64" %"PRIx32, oid,
				   ret);
			return ret;
		}
	}

forward_write:
	if (create)
		sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ);
	else
		sd_init_req(&hdr, SD_OP_WRITE_OBJ);
	hdr.flags = SD_FLAG_CMD_WRITE;
	hdr.data_length = datalen;

	hdr.obj.oid = oid;
	hdr.obj.offset = offset;
	hdr.obj.copies = get_vdi_copy_number(oid_to_vid(oid));

	ret = exec_local_req(&hdr, data);
	if (ret != SD_RES_SUCCESS)
		sd_eprintf("failed to write object %" PRIx64 ", %s", oid,
			   sd_strerror(ret));

	return ret;
}
예제 #14
0
/*
 * Tell whether @oid is stored erasure coded.
 *
 * VDI objects, VDI btree objects and ledger objects never are; any other
 * object is when the copy policy of its VDI is greater than zero.
 */
bool is_erasure_oid(uint64_t oid)
{
	if (is_vdi_obj(oid) || is_vdi_btree_obj(oid) || is_ledger_object(oid))
		return false;

	return get_vdi_copy_policy(oid_to_vid(oid)) > 0;
}
예제 #15
0
/*
 * Build the per-replica requests for an erasure-coded operation.
 *
 * For a write, the data strips of @req and the parity strips computed from
 * them are spread over the returned requests.  For any other opcode only
 * the data strip buffers are allocated.
 *
 * The number of requests is stored in *@nr: the data strip count for
 * SD_OP_READ_OBJ, otherwise the value returned by ec_policy_to_dp().
 *
 * Returns the request array, or NULL when the erasure buffer for a write
 * cannot be initialized (*@nr has already been set in that case).
 */
static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
{
	uint32_t len = req->rq.data_length;
	uint64_t off = req->rq.obj.offset;
	int opcode = req->rq.opcode;
	int is_write = (opcode == SD_OP_WRITE_OBJ ||
			opcode == SD_OP_CREATE_AND_WRITE_OBJ);
	int start = off / SD_EC_DATA_STRIPE_SIZE;
	int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE);
	int nr_stripe = end - start;
	int i, j;
	struct fec *ctx;
	int strip_size, nr_to_send;
	struct req_iter *reqs;
	char *cursor, *stripes = NULL;
	uint8_t policy = req->rq.obj.copy_policy ?:
		get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));
	int ed = 0, ep = 0, edp;

	edp = ec_policy_to_dp(policy, &ed, &ep);
	ctx = ec_init(ed, edp);

	/* Reads only need the data strips; everything else gets all of them. */
	nr_to_send = (opcode == SD_OP_READ_OBJ) ? ed : edp;
	*nr = nr_to_send;
	strip_size = SD_EC_DATA_STRIPE_SIZE / ed;
	reqs = xzalloc(sizeof(*reqs) * nr_to_send);

	sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
		 start, end, nr_to_send, off, len);

	for (i = 0; i < nr_to_send; i++) {
		int strips_len = strip_size * nr_stripe;

		reqs[i].buf = xmalloc(strips_len);
		reqs[i].dlen = strips_len;
		reqs[i].off = start * strip_size;
		if (is_write)
			reqs[i].wlen = strips_len;
	}

	if (!is_write)
		goto out; /* Read and remove operation */

	stripes = init_erasure_buffer(req, SD_EC_DATA_STRIPE_SIZE * nr_stripe);
	cursor = stripes;
	if (!stripes) {
		sd_err("failed to init erasure buffer %"PRIx64,
		       req->rq.obj.oid);
		for (i = 0; i < nr_to_send; i++)
			free(reqs[i].buf);
		free(reqs);
		reqs = NULL;
		goto out;
	}

	/* Split every stripe into data strips and encode its parity strips. */
	for (i = 0; i < nr_stripe; i++, cursor += SD_EC_DATA_STRIPE_SIZE) {
		const uint8_t *ds[ed];
		uint8_t *ps[ep];

		for (j = 0; j < ed; j++) {
			ds[j] = reqs[j].buf + strip_size * i;
			memcpy((uint8_t *)ds[j], cursor + j * strip_size,
			       strip_size);
		}

		for (j = 0; j < ep; j++)
			ps[j] = reqs[ed + j].buf + strip_size * i;

		ec_encode(ctx, ds, ps);
	}
out:
	ec_destroy(ctx);
	free(stripes);

	return reqs;
}
예제 #16
0
/*
 * Create object @oid and write the data described by @iocb.
 *
 * The data is written to a temporary file which is then renamed to the
 * final store path.  Unless journaling is in use or sys->nosync is set,
 * the containing directory is fsync()ed afterwards.  On success the object
 * is added to the object list cache.
 *
 * Returns SD_RES_SUCCESS on success (also when the temporary file already
 * exists), otherwise the code produced by err_to_sderr().
 */
int default_create_and_write(uint64_t oid, const struct siocb *iocb)
{
	char path[PATH_MAX], tmp_path[PATH_MAX], *dir;
	int flags = prepare_iocb(oid, iocb, true);
	int ret, fd;
	uint32_t len = iocb->length;
	uint32_t object_size = 0;
	size_t obj_size;
	uint64_t offset = iocb->offset;

	sd_debug("%016"PRIx64, oid);
	get_store_path(oid, iocb->ec_index, path);
	get_store_tmp_path(oid, iocb->ec_index, tmp_path);

	/*
	 * Journal the write when journaling is enabled.  If that fails,
	 * disable journaling, switch this write to O_SYNC and sync().
	 */
	if (uatomic_is_true(&sys->use_journal) &&
	    journal_write_store(oid, iocb->buf, iocb->length,
				iocb->offset, true)
	    != SD_RES_SUCCESS) {
		sd_err("turn off journaling");
		uatomic_set_false(&sys->use_journal);
		flags |= O_SYNC;
		sync();
	}

	fd = open(tmp_path, flags, sd_def_fmode);
	if (fd < 0) {
		if (errno == EEXIST) {
			/*
			 * This happens if node membership changes during object
			 * creation; while gateway retries a CREATE request,
			 * recovery process could also recover the object at the
			 * same time.  They should try to write the same data,
			 * so it is okay to simply return success here.
			 */
			sd_debug("%s exists", tmp_path);
			return SD_RES_SUCCESS;
		}

		/*
		 * NOTE(review): errno is read after sd_err(); confirm the
		 * logger cannot clobber it.
		 */
		sd_err("failed to open %s: %m", tmp_path);
		return err_to_sderr(path, oid, errno);
	}

	obj_size = get_store_objsize(oid);

	/* offset and len are local copies; iocb itself is left untouched. */
	trim_zero_blocks(iocb->buf, &offset, &len);

	object_size = get_vdi_object_size(oid_to_vid(oid));

	/*
	 * The write does not cover the whole object: size the file up front,
	 * by truncation for sparse objects and by preallocation otherwise.
	 */
	if (offset != 0 || len != get_objsize(oid, object_size)) {
		if (is_sparse_object(oid))
			ret = xftruncate(fd, obj_size);
		else
			ret = prealloc(fd, obj_size);
		if (ret < 0) {
			ret = err_to_sderr(path, oid, errno);
			goto out;
		}
	}

	/*
	 * NOTE(review): iocb->buf is written from its start at the trimmed
	 * offset; presumably trim_zero_blocks() shifts the payload to the
	 * front of the buffer -- confirm.
	 */
	ret = xpwrite(fd, iocb->buf, len, offset);
	if (ret != len) {
		sd_err("failed to write object. %m");
		ret = err_to_sderr(path, oid, errno);
		goto out;
	}

	/* Move the fully written temporary file to its final path. */
	ret = rename(tmp_path, path);
	if (ret < 0) {
		sd_err("failed to rename %s to %s: %m", tmp_path, path);
		ret = err_to_sderr(path, oid, errno);
		goto out;
	}

	close(fd);

	/* No directory fsync when journaling is on or nosync is requested. */
	if (uatomic_is_true(&sys->use_journal) || sys->nosync == true) {
		objlist_cache_insert(oid);
		return SD_RES_SUCCESS;
	}

	/* tmp_path is reused as scratch space for dirname(). */
	pstrcpy(tmp_path, sizeof(tmp_path), path);
	dir = dirname(tmp_path);
	fd = open(dir, O_DIRECTORY | O_RDONLY);
	if (fd < 0) {
		/*
		 * NOTE(review): unlike the fsync() failure below, the renamed
		 * object is left in place here while an error is returned.
		 */
		sd_err("failed to open directory %s: %m", dir);
		return err_to_sderr(path, oid, errno);
	}

	if (fsync(fd) != 0) {
		sd_err("failed to write directory %s: %m", dir);
		ret = err_to_sderr(path, oid, errno);
		close(fd);
		/* Remove the object whose directory entry could not be synced. */
		if (unlink(path) != 0)
			sd_err("failed to unlink %s: %m", path);
		return ret;
	}
	close(fd);
	objlist_cache_insert(oid);
	return SD_RES_SUCCESS;

out:
	/* Failure before or during rename: drop the temporary file. */
	if (unlink(tmp_path) != 0)
		sd_err("failed to unlink %s: %m", tmp_path);
	close(fd);
	return ret;
}