Exemple #1
0
int parse_vdi(vdi_parser_func_t func, size_t size, void *data)
{
	int ret;
	unsigned long nr;
	static struct sd_inode i;
	struct sd_req req;
	struct sd_rsp *rsp = (struct sd_rsp *)&req;
	static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
	unsigned int rlen = sizeof(vdi_inuse);

	sd_init_req(&req, SD_OP_READ_VDIS);
	req.data_length = sizeof(vdi_inuse);

	ret = dog_exec_req(sdhost, sdport, &req, &vdi_inuse);
	if (ret < 0)
		goto out;
	if (rsp->result != SD_RES_SUCCESS) {
		sd_err("%s", sd_strerror(rsp->result));
		goto out;
	}

	FOR_EACH_VDI(nr, vdi_inuse) {
		uint64_t oid;
		uint32_t snapid;

		oid = vid_to_vdi_oid(nr);

		memset(&i, 0, sizeof(i));
		ret = sd_read_object(oid, &i, SD_INODE_HEADER_SIZE, 0, true);
		if (ret != SD_RES_SUCCESS) {
			sd_err("Failed to read inode header");
			continue;
		}

		if (i.name[0] == '\0') /* this VDI has been deleted */
			continue;

		if (size > SD_INODE_HEADER_SIZE) {
			rlen = DIV_ROUND_UP(i.vdi_size, SD_DATA_OBJ_SIZE) *
				sizeof(i.data_vdi_id[0]);
			if (rlen > size - SD_INODE_HEADER_SIZE)
				rlen = size - SD_INODE_HEADER_SIZE;

			ret = sd_read_object(oid, ((char *)&i) + SD_INODE_HEADER_SIZE,
					     rlen, SD_INODE_HEADER_SIZE, true);

			if (ret != SD_RES_SUCCESS) {
				sd_err("Failed to read inode");
				continue;
			}
		}

		snapid = vdi_is_snapshot(&i) ? i.snap_id : 0;
		func(i.vdi_id, i.name, i.tag, snapid, 0, &i, data);
	}
Exemple #2
0
/*
 * B-tree traversal callback: report one object of a bucket.
 *
 * Only BTREE_EXT nodes with a non-zero vdi_id are considered.  The data
 * object referenced by the extent is read in full; if its name is not
 * empty, the caller-supplied callback (when set) is invoked with the name
 * and the running object count is incremented.
 *
 * @data: the visited node, interpreted as struct sd_extent for BTREE_EXT
 * @type: kind of the visited node
 * @arg:  struct object_iterater_arg carrying callback, opaque and counter
 */
static void object_iterater(void *data, enum btree_node_type type, void *arg)
{
	struct object_iterater_arg *iter = arg;
	struct sd_extent *extent;
	struct kv_onode *obj;
	uint64_t data_oid;
	int rc;

	if (type != BTREE_EXT)
		return;

	extent = (struct sd_extent *)data;
	if (extent->vdi_id == 0)
		return;

	obj = xmalloc(SD_DATA_OBJ_SIZE);
	data_oid = vid_to_data_oid(extent->vdi_id, extent->idx);
	rc = sd_read_object(data_oid, (char *)obj, SD_DATA_OBJ_SIZE, 0);
	if (rc != SD_RES_SUCCESS) {
		sd_err("Failed to read data object %"PRIx64, data_oid);
	} else if (obj->name[0] != '\0') {
		/* An empty name is skipped without being counted */
		if (iter->cb)
			iter->cb(obj->name, iter->opaque);
		iter->count++;
	}

	free(obj);
}
Exemple #3
0
static void bucket_iterater(struct sd_index *idx, void *arg, int ignore)
{
	struct bucket_iterater_arg *biarg = arg;
	struct kv_bnode bnode;
	uint64_t oid;
	int ret;

	if (!idx->vdi_id)
		return;

	oid = vid_to_data_oid(idx->vdi_id, idx->idx);
	ret = sd_read_object(oid, (char *)&bnode, sizeof(bnode), 0);
	if (ret != SD_RES_SUCCESS) {
		sd_err("Failed to read data object %"PRIx64, oid);
		return;
	}

	if (bnode.name[0] == 0)
		return;
	if (biarg->cb)
		biarg->cb(bnode.name, biarg->opaque);
	biarg->bucket_count++;
	biarg->object_count += bnode.object_count;
	biarg->bytes_used += bnode.bytes_used;
}
Exemple #4
0
/*
 * Look up a VDI by name (optionally by snapshot id or tag) and read the
 * first 'size' bytes of its inode object into 'inode'.
 *
 * @vdiname: name of the VDI to open
 * @snapid:  snapshot id passed to the lookup; only affects the error text
 *           when non-zero
 * @tag:     snapshot tag passed to the lookup; may be NULL
 * @pvid:    if not NULL, receives the VDI id on success
 * @inode:   destination buffer
 * @size:    number of bytes to read from offset 0
 *
 * Returns EXIT_SUCCESS or EXIT_FAILURE; failures are reported on stderr.
 */
static int read_vdi_obj(char *vdiname, int snapid, const char *tag,
			uint32_t *pvid, struct sheepdog_inode *inode,
			size_t size)
{
	uint32_t found_vid;
	int rc;

	rc = find_vdi_name(vdiname, snapid, tag, &found_vid, 0);
	if (rc < 0) {
		fprintf(stderr, "Failed to open VDI %s\n", vdiname);
		return EXIT_FAILURE;
	}

	rc = sd_read_object(vid_to_vdi_oid(found_vid), inode, size, 0, true);
	if (rc == SD_RES_SUCCESS) {
		if (pvid != NULL)
			*pvid = found_vid;
		return EXIT_SUCCESS;
	}

	/* Pick the message that matches how the VDI was addressed */
	if (snapid != 0)
		fprintf(stderr, "Failed to read a snapshot %s:%d\n",
			vdiname, snapid);
	else if (tag != NULL && tag[0] != '\0')
		fprintf(stderr, "Failed to read a snapshot %s:%s\n",
			vdiname, tag);
	else
		fprintf(stderr, "Failed to read a vdi %s\n", vdiname);

	return EXIT_FAILURE;
}
Exemple #5
0
/*
 * Create a bucket node inside the account identified by 'account_vid'.
 *
 * The account inode is read, then slots are probed linearly starting at
 * the hash of the bucket name until one without a vid is found; that slot
 * is handed to bnode_do_create().
 *
 * Returns the result of the inode read on failure, SD_RES_NO_SPACE when
 * all MAX_DATA_OBJS slots are taken, otherwise the result of
 * bnode_do_create().
 */
static int bnode_create(struct kv_bnode *bnode, uint32_t account_vid)
{
	struct sd_inode *account_inode = xmalloc(sizeof(*account_inode));
	uint32_t occupant, slot;
	uint64_t hash, probe;
	int rc;

	rc = sd_read_object(vid_to_vdi_oid(account_vid), (char *)account_inode,
			    sizeof(*account_inode), 0);
	if (rc != SD_RES_SUCCESS) {
		sd_err("failed to read %" PRIx32 " %s", account_vid,
		       sd_strerror(rc));
		goto release;
	}

	hash = sd_hash(bnode->name, strlen(bnode->name));
	probe = 0;
	while (probe < MAX_DATA_OBJS) {
		slot = (hash + probe) % MAX_DATA_OBJS;
		occupant = INODE_GET_VID(account_inode, slot);
		if (!occupant)
			break;	/* first slot without a vid */
		probe++;
	}

	if (probe == MAX_DATA_OBJS)
		rc = SD_RES_NO_SPACE;
	else
		rc = bnode_do_create(bnode, account_inode, slot);
release:
	free(account_inode);
	return rc;
}
Exemple #6
0
static int vdi_snapshot(int argc, char **argv)
{
	char *vdiname = argv[optind++];
	uint32_t vid;
	int ret;
	char buf[SD_INODE_HEADER_SIZE];
	struct sheepdog_inode *inode = (struct sheepdog_inode *)buf;

	if (vdi_cmd_data.snapshot_id != 0) {
		fprintf(stderr, "please specify a non-integer value for "
			"a snapshot tag name\n");
		return EXIT_USAGE;
	}

	ret = find_vdi_name(vdiname, 0, "", &vid, 0);
	if (ret < 0) {
		fprintf(stderr, "failed to open vdi %s\n", vdiname);
		return EXIT_FAILURE;
	}

	ret = sd_read_object(vid_to_vdi_oid(vid), inode, SD_INODE_HEADER_SIZE, 0);
	if (ret != SD_RES_SUCCESS) {
		fprintf(stderr, "failed to read an inode header\n");
		return EXIT_FAILURE;
	}

	if (vdi_cmd_data.snapshot_tag[0]) {
		ret = sd_write_object(vid_to_vdi_oid(vid), 0, vdi_cmd_data.snapshot_tag,
				      SD_MAX_VDI_TAG_LEN,
				      offsetof(struct sheepdog_inode, tag),
				      0, inode->nr_copies, 0);
	}
Exemple #7
0
static int read_account_meta(const char *account, uint64_t *bucket_count,
			     uint64_t *object_count, uint64_t *used)
{
	struct sd_inode *inode = NULL;
	struct bucket_iterater_arg arg = {};
	uint32_t account_vid;
	uint64_t oid;
	int ret;

	ret = sd_lookup_vdi(account, &account_vid);
	if (ret != SD_RES_SUCCESS)
		goto out;

	oid = vid_to_vdi_oid(account_vid);
	inode = xmalloc(sizeof(*inode));
	ret = sd_read_object(oid, (char *)inode, sizeof(struct sd_inode), 0);
	if (ret != SD_RES_SUCCESS) {
		sd_err("Failed to read inode header %"PRIx64, oid);
		goto out;
	}

	sd_inode_index_walk(inode, bucket_iterater, &arg);
	*object_count = arg.object_count;
	*bucket_count = arg.bucket_count;
	*used = arg.bytes_used;
out:
	free(inode);
	return ret;
}
Exemple #8
0
static void bucket_iterater(void *data, enum btree_node_type type, void *arg)
{
	struct sd_extent *ext;
	struct bucket_iterater_arg *biarg = arg;
	struct kv_bnode bnode;
	uint64_t oid;
	int ret;

	if (type == BTREE_EXT) {
		ext = (struct sd_extent *)data;
		if (!ext->vdi_id)
			return;

		oid = vid_to_data_oid(ext->vdi_id, ext->idx);
		ret = sd_read_object(oid, (char *)&bnode, sizeof(bnode), 0);
		if (ret != SD_RES_SUCCESS) {
			sd_err("Failed to read data object %"PRIx64, oid);
			return;
		}

		if (bnode.name[0] == 0)
			return;
		if (biarg->cb)
			biarg->cb(bnode.name, biarg->opaque);
		biarg->bucket_count++;
		biarg->object_count += bnode.object_count;
		biarg->bytes_used += bnode.bytes_used;
	}
}
Exemple #9
0
/*
 * Traverse the buckets of an account with bucket_iterater.
 *
 * The account VDI is looked up by name, locked through the cluster driver
 * for the duration of the inode read and the b-tree traversal, and
 * unlocked before returning.
 *
 * @account: account name to look up
 * @cb:      callback stored in the iterator argument
 * @opaque:  caller context stored in the iterator argument
 *
 * Returns SD_RES_SUCCESS, or the error from the lookup or the inode read.
 */
int kv_iterate_bucket(const char *account, bucket_iter_cb cb, void *opaque)
{
	/*
	 * NOTE(review): this inode is an automatic variable, whereas the
	 * other readers of struct sd_inode in this code allocate it with
	 * xmalloc() - confirm the stack is large enough for it.
	 */
	struct sd_inode account_inode;
	struct bucket_iterater_arg walk = {opaque, cb, 0, 0, 0};
	uint32_t vid;
	uint64_t inode_oid;
	int rc;

	rc = sd_lookup_vdi(account, &vid);
	if (rc != SD_RES_SUCCESS) {
		sd_err("Failed to find account %s", account);
		return rc;
	}

	inode_oid = vid_to_vdi_oid(vid);
	sys->cdrv->lock(vid);
	rc = sd_read_object(inode_oid, (char *)&account_inode,
			    sizeof(account_inode), 0);
	if (rc == SD_RES_SUCCESS)
		traverse_btree(sheep_bnode_reader, &account_inode,
			       bucket_iterater, &walk);
	else
		sd_err("Failed to read account inode header %s", account);

	sys->cdrv->unlock(vid);
	return rc;
}
Exemple #10
0
/*
 * Create an object node inside the bucket identified by 'bucket_vid'.
 *
 * With the bucket locked, the bucket inode is read and slots are probed
 * linearly starting at the hash of the object name:
 *   - a slot without a vid ends the probe and is passed to
 *     onode_do_create() with create == true;
 *   - a slot with a vid whose stored name is empty ends the probe and is
 *     passed to onode_do_create() with create == false;
 *   - any other slot is skipped.
 *
 * Returns a read error as is, SD_RES_NO_SPACE when all MAX_DATA_OBJS slots
 * were skipped, otherwise the result of onode_do_create().
 */
static int onode_create(struct kv_onode *onode, uint32_t bucket_vid)
{
	struct sd_inode *bucket_inode = xmalloc(sizeof(*bucket_inode));
	bool fresh_slot = true;
	uint32_t occupant, slot;
	uint64_t hash, probe;
	int rc;

	sys->cdrv->lock(bucket_vid);
	rc = sd_read_object(vid_to_vdi_oid(bucket_vid), (char *)bucket_inode,
			    sizeof(*bucket_inode), 0);
	if (rc != SD_RES_SUCCESS) {
		sd_err("failed to read %" PRIx32 " %s", bucket_vid,
		       sd_strerror(rc));
		goto release;
	}

	hash = sd_hash(onode->name, strlen(onode->name));
	for (probe = 0; probe < MAX_DATA_OBJS; probe++) {
		slot = (hash + probe) % MAX_DATA_OBJS;
		occupant = INODE_GET_VID(bucket_inode, slot);
		if (!occupant)
			break;	/* slot has no vid yet */
		else {
			uint64_t data_oid = vid_to_data_oid(bucket_vid, slot);
			char stored[SD_MAX_OBJECT_NAME] = { };

			rc = sd_read_object(data_oid, stored, sizeof(stored), 0);
			if (rc != SD_RES_SUCCESS)
				goto release;
			if (stored[0] == 0) {
				/* slot has a vid but an empty name */
				fresh_slot = false;
				break;
			}
		}
	}

	if (probe == MAX_DATA_OBJS) {
		rc = SD_RES_NO_SPACE;
		goto release;
	}

	rc = onode_do_create(onode, bucket_inode, slot, fresh_slot);
release:
	free(bucket_inode);
	sys->cdrv->unlock(bucket_vid);
	return rc;
}
Exemple #11
0
/*
 * Check whether an object called 'name' exists in a bucket and, if it
 * does, leave its node in 'onode'.
 *
 * Return SD_RES_SUCCESS if found, SD_RES_NO_OBJ if not found, or the error
 * of a failed read.
 *
 * The start index is obtained by hashing the name; adjacent slots are then
 * checked one by one. An unallocated slot marks the end of the check
 * window.
 *
 * For example, to check whether 'fish' is in the following bucket, assume
 * 'fish' hashes to the slot of 'sheep': names are compared one by one from
 * 'sheep' to 'fish'. '\0' indicates an object that was deleted earlier.
 *
 * [ sheep, dog, wolve, '\0', fish, {unallocated}, tiger, ]
 *
 * Every probed slot is read into 'onode', so its contents are only
 * meaningful when SD_RES_SUCCESS is returned.
 */
static int onode_lookup(struct kv_onode *onode, uint32_t ovid, const char *name)
{
	struct sd_inode *bucket_inode = xmalloc(sizeof(*bucket_inode));
	uint32_t occupant, slot;
	uint64_t hash, probe;
	int rc;

	sys->cdrv->lock(ovid);
	rc = sd_read_object(vid_to_vdi_oid(ovid), (char *)bucket_inode,
			    sizeof(*bucket_inode), 0);
	if (rc != SD_RES_SUCCESS) {
		sd_err("failed to read %" PRIx32 " %s", ovid,
		       sd_strerror(rc));
		goto release;
	}

	hash = sd_hash(name, strlen(name));
	for (probe = 0; probe < MAX_DATA_OBJS; probe++) {
		uint64_t data_oid;

		slot = (hash + probe) % MAX_DATA_OBJS;
		occupant = INODE_GET_VID(bucket_inode, slot);
		if (!occupant) {
			/* end of the check window */
			rc = SD_RES_NO_OBJ;
			break;
		}

		data_oid = vid_to_data_oid(ovid, slot);
		rc = sd_read_object(data_oid, (char *)onode,
				    sizeof(*onode), 0);
		if (rc != SD_RES_SUCCESS)
			goto release;
		if (!strcmp(onode->name, name))
			break;
	}

	/* Every slot was occupied by something else */
	if (probe == MAX_DATA_OBJS)
		rc = SD_RES_NO_OBJ;
release:
	free(bucket_inode);
	sys->cdrv->unlock(ovid);
	return rc;
}
Exemple #12
0
/*
 * Traverse the objects of a bucket with object_iterater.
 *
 * @bucket_vid: VDI id of the bucket whose inode is read
 * @cb:         callback stored in the iterator argument
 * @opaque:     caller context stored in the iterator argument
 *
 * Returns the result of reading the bucket inode; the traversal itself
 * does not change the return value.
 */
static int bucket_iterate_object(uint32_t bucket_vid, object_iter_cb cb,
				 void *opaque)
{
	struct object_iterater_arg walk = {opaque, cb, 0};
	struct sd_inode *bucket_inode = xmalloc(sizeof(*bucket_inode));
	int rc;

	rc = sd_read_object(vid_to_vdi_oid(bucket_vid), (char *)bucket_inode,
			    sizeof(struct sd_inode), 0);
	if (rc == SD_RES_SUCCESS)
		traverse_btree(sheep_bnode_reader, bucket_inode,
			       object_iterater, &walk);
	else
		sd_err("failed to read inode %s", sd_strerror(rc));

	free(bucket_inode);
	return rc;
}
Exemple #13
0
/*
 * Find the bucket node called 'name' among the data objects of 'vid'.
 *
 * Slots are probed linearly starting at the hash of the name; each probed
 * object is read into 'bnode' and its name compared with 'name'.
 *
 * Returns SD_RES_SUCCESS when a matching node was read into 'bnode', the
 * error of the first failed read, or SD_RES_NO_OBJ when all MAX_DATA_OBJS
 * slots were read without a match.
 */
static int bnode_lookup(struct kv_bnode *bnode, uint32_t vid, const char *name)
{
	uint64_t hash, probe;
	int rc;

	hash = sd_hash(name, strlen(name));
	for (probe = 0; probe < MAX_DATA_OBJS; probe++) {
		uint32_t slot = (hash + probe) % MAX_DATA_OBJS;
		uint64_t data_oid = vid_to_data_oid(vid, slot);

		rc = sd_read_object(data_oid, (char *)bnode, sizeof(*bnode), 0);
		if (rc != SD_RES_SUCCESS)
			return rc;
		if (!strcmp(bnode->name, name))
			return rc;	/* found; rc is SD_RES_SUCCESS here */
	}

	return SD_RES_NO_OBJ;
}
Exemple #14
0
/*
 * Invoke 'func' for every VDI that is in use and not deleted.
 *
 * The in-use bitmap is fetched with SD_OP_READ_VDIS.  For each set bit the
 * inode header is read; inodes with an empty name are skipped as deleted.
 * When 'size' exceeds SD_INODE_HEADER_SIZE, the part of the inode after
 * the header is read too, limited to the entries covering vdi_size and to
 * 'size'.
 *
 * @func: callback receiving vdi id, name, tag, snapshot id (0 when
 *        is_current() is true), a zero flag, the inode and 'data'
 * @size: number of inode bytes the callback wants to have available
 * @data: opaque pointer passed through to 'func'
 *
 * Returns -1 if the connection fails, the negative request result if the
 * bitmap cannot be read, otherwise the result of the last request or
 * object read performed.
 */
int parse_vdi(vdi_parser_func_t func, size_t size, void *data)
{
	static struct sheepdog_inode inode;
	static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
	struct sd_req req;
	unsigned int body_len;
	unsigned long nr;
	int ret, fd;

	fd = connect_to(sdhost, sdport);
	if (fd < 0) {
		fprintf(stderr, "Failed to connect to %s:%d\n", sdhost, sdport);
		return -1;
	}

	sd_init_req(&req, SD_OP_READ_VDIS);
	req.data_length = sizeof(vdi_inuse);

	ret = collie_exec_req(fd, &req, &vdi_inuse);
	if (ret < 0) {
		fprintf(stderr, "Failed to read VDIs from %s:%d\n",
			sdhost, sdport);
		close(fd);
		return ret;
	}
	close(fd);

	for (nr = 0; nr < SD_NR_VDIS; nr++) {
		uint64_t oid;
		uint32_t snapid;

		if (!test_bit(nr, vdi_inuse))
			continue;

		oid = vid_to_vdi_oid(nr);

		memset(&inode, 0, sizeof(inode));
		ret = sd_read_object(oid, &inode, SD_INODE_HEADER_SIZE, 0, true);
		if (ret != SD_RES_SUCCESS) {
			fprintf(stderr, "Failed to read inode header\n");
			continue;
		}

		/* this VDI has been deleted */
		if (inode.name[0] == '\0')
			continue;

		if (size > SD_INODE_HEADER_SIZE) {
			size_t room = size - SD_INODE_HEADER_SIZE;

			body_len = DIV_ROUND_UP(inode.vdi_size, SD_DATA_OBJ_SIZE) *
				sizeof(inode.data_vdi_id[0]);
			if (body_len > room)
				body_len = room;

			ret = sd_read_object(oid,
					     ((char *)&inode) + SD_INODE_HEADER_SIZE,
					     body_len, SD_INODE_HEADER_SIZE, true);
			if (ret != SD_RES_SUCCESS) {
				fprintf(stderr, "Failed to read inode\n");
				continue;
			}
		}

		if (is_current(&inode))
			snapid = 0;
		else
			snapid = inode.snap_id;
		func(inode.vdi_id, inode.name, inode.tag, snapid, 0, &inode, data);
	}

	return ret;
}
Exemple #15
0
/*
 * "vdi create" command: create a VDI and optionally preallocate it.
 *
 * The VDI name and size are taken from argv at optind.  After the VDI is
 * created, and only when vdi_cmd_data.prealloc is set, every data object
 * covering the requested size is written with zeroes and the matching
 * data_vdi_id slot of the inode object is updated one entry at a time.
 *
 * Returns EXIT_USAGE for a missing, unparsable or too large size,
 * EXIT_SYSFAIL when the buffers cannot be allocated, EXIT_FAILURE when
 * reading or writing an object fails, the result of do_vdi_create() when
 * that fails or no preallocation was requested, otherwise EXIT_SUCCESS.
 */
static int vdi_create(int argc, char **argv)
{
	char *vdiname = argv[optind++];
	struct sheepdog_inode *inode;
	char *zeroes;
	uint64_t size, oid;
	uint32_t vid;
	int idx, max_idx, ret;

	if (argv[optind] == NULL) {
		fprintf(stderr, "please specify the size of vdi\n");
		return EXIT_USAGE;
	}
	ret = parse_option_size(argv[optind], &size);
	if (ret < 0)
		return EXIT_USAGE;
	if (size > SD_MAX_VDI_SIZE) {
		fprintf(stderr, "too big image size, %s\n", argv[optind]);
		return EXIT_USAGE;
	}

	ret = do_vdi_create(vdiname, size, 0, &vid, 0);
	if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc)
		return ret;	/* nothing has been allocated yet */

	inode = malloc(sizeof(*inode));
	zeroes = zalloc(SD_DATA_OBJ_SIZE);
	if (inode == NULL || zeroes == NULL) {
		fprintf(stderr, "oom\n");
		ret = EXIT_SYSFAIL;
		goto cleanup;
	}

	ret = sd_read_object(vid_to_vdi_oid(vid), inode, sizeof(*inode), 0);
	if (ret != SD_RES_SUCCESS) {
		fprintf(stderr, "failed to read a newly created vdi object\n");
		ret = EXIT_FAILURE;
		goto cleanup;
	}

	max_idx = DIV_ROUND_UP(size, SD_DATA_OBJ_SIZE);
	for (idx = 0; idx < max_idx; idx++) {
		/* Write the zero-filled data object itself */
		oid = vid_to_data_oid(vid, idx);
		ret = sd_write_object(oid, 0, zeroes, SD_DATA_OBJ_SIZE, 0, 0,
				      inode->nr_copies, 1);
		if (ret != SD_RES_SUCCESS) {
			ret = EXIT_FAILURE;
			goto cleanup;
		}

		/* Then store the owning vid in the inode slot for this index */
		inode->data_vdi_id[idx] = vid;
		ret = sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid),
				      SD_INODE_HEADER_SIZE + sizeof(vid) * idx, 0,
				      inode->nr_copies, 0);
		if (ret != 0) {
			ret = EXIT_FAILURE;
			goto cleanup;
		}
	}
	ret = EXIT_SUCCESS;
cleanup:
	free(inode);
	free(zeroes);
	return ret;
}
/*
 * Invoke 'func' for every VDI that is in use and not deleted.
 *
 * The in-use bitmap is fetched with SD_OP_READ_VDIS.  For each set bit the
 * inode header is read; inodes with an empty name are skipped as deleted.
 * When 'size' exceeds SD_INODE_HEADER_SIZE, the part of the inode after
 * the header is read too, limited to the entries covering vdi_size and to
 * 'size'.
 *
 * @func: callback receiving vdi id, name, tag, snapshot id, a zero flag,
 *        the inode and 'data'
 * @size: number of inode bytes the callback wants to have available
 * @data: opaque pointer passed through to 'func'
 *
 * Returns the negative value from connect_to() or exec_req() on failure,
 * otherwise 0; per-VDI read failures are reported on stderr and skipped.
 */
int parse_vdi(vdi_parser_func_t func, size_t size, void *data)
{
	static struct sheepdog_inode inode;
	static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
	struct sd_req req;
	unsigned int rlen, wlen = 0;
	unsigned long nr;
	int ret, fd;

	fd = connect_to(sdhost, sdport);
	if (fd < 0)
		return fd;

	memset(&req, 0, sizeof(req));
	req.opcode = SD_OP_READ_VDIS;
	req.epoch = node_list_version;
	req.data_length = sizeof(vdi_inuse);

	rlen = sizeof(vdi_inuse);
	ret = exec_req(fd, &req, vdi_inuse, &wlen, &rlen);
	/* The connection is only needed for the bitmap request */
	close(fd);
	if (ret < 0)
		return ret;

	for (nr = 0; nr < SD_NR_VDIS; nr++) {
		uint64_t oid;

		if (!test_bit(nr, vdi_inuse))
			continue;

		oid = vid_to_vdi_oid(nr);

		memset(&inode, 0, sizeof(inode));
		ret = sd_read_object(oid, &inode, SD_INODE_HEADER_SIZE, 0);
		if (ret != SD_RES_SUCCESS) {
			fprintf(stderr, "Failed to read inode header\n");
			continue;
		}

		/* this VDI has been deleted */
		if (inode.name[0] == '\0')
			continue;

		if (size > SD_INODE_HEADER_SIZE) {
			size_t room = size - SD_INODE_HEADER_SIZE;

			rlen = DIV_ROUND_UP(inode.vdi_size, SD_DATA_OBJ_SIZE) *
				sizeof(inode.data_vdi_id[0]);
			if (rlen > room)
				rlen = room;

			ret = sd_read_object(oid,
					     ((char *)&inode) + SD_INODE_HEADER_SIZE,
					     rlen, SD_INODE_HEADER_SIZE);
			if (ret != SD_RES_SUCCESS) {
				fprintf(stderr, "Failed to read inode\n");
				continue;
			}
		}

		func(inode.vdi_id, inode.name, inode.tag, inode.snap_id, 0,
		     &inode, data);
	}

	return 0;
}
Exemple #17
0
/*
 * Reader callback handed to traverse_btree(): read 'len' bytes at 'offset'
 * of object 'oid' into the buffer that '*mem' points to.
 *
 * Returns the result of sd_read_object().
 */
int sheep_bnode_reader(uint64_t oid, void **mem, unsigned int len,
		       uint64_t offset)
{
	void *dest = *mem;

	return sd_read_object(oid, dest, len, offset);
}
Exemple #18
0
/*
 * Initialize the data vdi
 *
 * @vid: the vdi where the allocator resides
 */
int oalloc_init(uint32_t vid)
{
	struct strbuf buf = STRBUF_INIT;
	struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
	struct header hd = {
		.nr_free = 1,
	};
	struct free_desc fd = {
		.start = 1, /* Use first object as the meta object */
		.count = MAX_DATA_OBJS - 1,
	};
	int ret;

	strbuf_add(&buf, &hd, sizeof(hd));
	strbuf_add(&buf, &fd, sizeof(fd));

	ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode,
			     sizeof(*inode), 0);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to read inode, %" PRIx32", %s", vid,
		       sd_strerror(ret));
		goto out;
	}
	ret = sd_write_object(vid_to_data_oid(vid, 0), buf.buf,
			      buf.len, 0, true);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to create meta object for %" PRIx32", %s", vid,
		       sd_strerror(ret));
		goto out;
	}
	sd_inode_set_vid(inode, 0, vid);
	ret = sd_inode_write_vid(inode, 0, vid, vid, 0, false, false);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to update inode, %" PRIx32", %s", vid,
		       sd_strerror(ret));
		goto out;
	}
out:
	strbuf_release(&buf);
	free(inode);
	return ret;
}

/*
 * Allocate objects and update the free list.
 *
 * Callers are expected to call oalloc_new_finish() to update the inode
 * bitmap after filling up the data.
 *
 * Descriptors are examined starting from the last one in the meta object;
 * the first whose count is strictly greater than 'count' is used, so a
 * descriptor holding exactly 'count' objects is passed over.
 *
 * @vid: the vdi where the allocator resides
 * @start: receives the start index of the allocated objects
 * @count: number of the objects to allocate
 *
 * Returns SD_RES_NO_SPACE when no descriptor qualifies, the error of a
 * failed meta read or write, otherwise SD_RES_SUCCESS.
 */
int oalloc_new_prepare(uint32_t vid, uint64_t *start, uint64_t count)
{
	char *meta = xvalloc(SD_DATA_OBJ_SIZE);
	uint64_t meta_oid = vid_to_data_oid(vid, 0);
	struct header *hd = (struct header *)meta;
	struct free_desc *cand;
	uint64_t scanned;
	int rc;

	rc = sd_read_object(meta_oid, meta, SD_DATA_OBJ_SIZE, 0);
	if (rc != SD_RES_SUCCESS) {
		sd_err("failed to read meta %" PRIx64 ", %s", meta_oid,
		       sd_strerror(rc));
		goto release;
	}

	/* Start at the last descriptor and walk towards the header */
	cand = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1;
	sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free);
	scanned = 0;
	while (scanned < hd->nr_free) {
		sd_debug("start %"PRIu64", count %"PRIu64, cand->start,
			 cand->count);
		if (cand->count > count)
			break;
		scanned++;
		cand--;
	}
	if (scanned == hd->nr_free) {
		rc = SD_RES_NO_SPACE;
		goto release;
	}

	/* Carve the request off the front of the chosen descriptor */
	*start = cand->start;
	cand->start += count;
	cand->count -= count;
	hd->used += count;

	/* Update the meta object */
	rc = sd_write_object(meta_oid, meta, oalloc_meta_length(hd), 0, false);
	if (rc != SD_RES_SUCCESS)
		sd_err("failed to update meta %"PRIx64 ", %s", meta_oid,
		       sd_strerror(rc));
release:
	free(meta);
	return rc;
}

/*
 * Update the inode map of the vid
 *
 * The inode is read, the entries from 'start' to 'start + count - 1' are
 * set to 'vid', and the inode is written back.
 *
 * @vid: the vdi where the allocator resides
 * @start: start index of the objects to update
 * @count: number of the objects to update
 *
 * Returns SD_RES_SUCCESS or the error of the failed inode read or write.
 */
int oalloc_new_finish(uint32_t vid, uint64_t start, uint64_t count)
{
	struct sd_inode *inode = xmalloc(sizeof(*inode));
	int rc;

	rc = sd_read_object(vid_to_vdi_oid(vid), (char *)inode,
			    sizeof(*inode), 0);
	if (rc != SD_RES_SUCCESS) {
		sd_err("failed to read inode, %" PRIx64 ", %s",
		       vid_to_vdi_oid(vid), sd_strerror(rc));
		goto release;
	}

	sd_debug("start %"PRIu64" end %"PRIu64, start, start + count - 1);
	sd_inode_set_vid_range(inode, start, (start + count - 1), vid);

	rc = sd_inode_write(inode, 0, false, false);
	if (rc != SD_RES_SUCCESS)
		sd_err("failed to update inode, %" PRIx64", %s",
		       vid_to_vdi_oid(vid), sd_strerror(rc));
release:
	free(inode);
	return rc;
}

/*
 * Comparator for free descriptors: the negated intcmp() of the two start
 * indexes.
 */
static int free_desc_cmp(struct free_desc *a, struct free_desc *b)
{
	int by_start = intcmp(a->start, b->start);

	return -by_start;
}

/*
 * Return the range [start, start + count) to the free list held in the
 * in-memory copy of the meta object, then coalesce adjacent descriptors.
 *
 * Only the buffer is modified; nothing is written to storage here.
 *
 * @meta:  meta object contents, beginning with a struct header
 * @start: start index of the range being freed
 * @count: number of objects in the range
 * @vid:   vdi id, used only in the emergency log message
 *
 * Returns SD_RES_NO_SPACE, leaving the buffer untouched, when the range is
 * not adjacent to any descriptor and MAX_FREE_DESC descriptors already
 * exist; otherwise SD_RES_SUCCESS.
 */
static inline int update_and_merge_free_desc(char *meta, uint64_t start,
					     uint64_t count, uint32_t vid)
{
	struct header *hd = (struct header *)meta;
	struct free_desc *tail, *fd = HEADER_TO_FREE_DESC(hd);
	uint64_t i, j;

	/* Try our best to merge it in place, or append it to tail */
	for (i = 0; i < hd->nr_free; i++) {
		if (start + count == fd->start) {
			/* freed range ends where this descriptor begins */
			fd->start = start;
			fd->count += count;
			break;
		} else if(fd->start + fd->count == start) {
			/* freed range begins where this descriptor ends */
			fd->count +=count;
			break;
		}
		fd++;
	}

	/* No adjacent descriptor: append a new one if there is room */
	if (i == hd->nr_free) {
		if (hd->nr_free >= MAX_FREE_DESC)
			return SD_RES_NO_SPACE;

		tail = (struct free_desc *)(meta + oalloc_meta_length(hd));
		tail->start = start;
		tail->count = count;
		hd->nr_free++;
	}

	hd->used -= count;
	/*
	 * NOTE(review): free_desc_cmp negates intcmp() on the start index, so
	 * this presumably orders descriptors by descending start - confirm
	 * against xqsort()/intcmp().
	 */
	xqsort(HEADER_TO_FREE_DESC(hd), hd->nr_free, free_desc_cmp);

	/* Merge as hard as we can */
	j = hd->nr_free - 1;
	/* Walk from the last descriptor towards the first, pairing neighbours */
	tail = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1;
	for (i = 0; i < j; i++, tail--) {
		struct free_desc *front = tail - 1;

		sd_debug("start %"PRIu64", count %"PRIu64, tail->start,
			 tail->count);
		/* tail reaching past the start of front means the ranges overlap */
		if (tail->start + tail->count > front->start)
			sd_emerg("bad free descriptor found at %"PRIx32, vid);
		if (tail->start + tail->count == front->start) {
			/* Fold tail into front, then close the gap left by tail */
			front->start = tail->start;
			front->count += tail->count;
			memmove(tail, tail + 1, sizeof(*tail) * i);
			hd->nr_free--;
		}
	}

	return SD_RES_SUCCESS;
}

/*
 * Discard the allocated objects and update the free list of the allocator
 *
 * Caller should check the return value since it might fail.
 *
 * Steps, in order: clear the inode entries of the range and write the
 * inode back, read the meta object, merge the range into the free list in
 * memory, remove the data objects, and finally write the meta object.
 *
 * @vid: the vdi where the allocator resides
 * @start: start index of the objects to free
 * @count: number of the objects to free
 *
 * Returns the error of the first failing inode/meta step, or, when those
 * all succeed, the last error reported while removing a data object
 * (SD_RES_NO_OBJ is not treated as an error); otherwise SD_RES_SUCCESS.
 */
int oalloc_free(uint32_t vid, uint64_t start, uint64_t count)
{
	char *meta = xvalloc(SD_DATA_OBJ_SIZE);
	struct header *hd;
	uint64_t oid = vid_to_data_oid(vid, 0), i;
	struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
	int ret, remove_ret = SD_RES_SUCCESS;

	ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode,
			     sizeof(*inode), 0);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to read inode, %" PRIx64 ", %s",
		       vid_to_vdi_oid(vid), sd_strerror(ret));
		goto out;
	}

	sd_debug("discard start %"PRIu64" end %"PRIu64, start,
		 start + count - 1);
	sd_inode_set_vid_range(inode, start, (start + count - 1), 0);

	ret = sd_inode_write(inode, 0, false, false);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to update inode, %" PRIx64", %s",
		       vid_to_vdi_oid(vid), sd_strerror(ret));
		goto out;
	}

	ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to read meta %" PRIx64 ", %s", oid,
		       sd_strerror(ret));
		goto out;
	}

	ret = update_and_merge_free_desc(meta, start, count, vid);
	if (ret != SD_RES_SUCCESS)
		goto out;

	/* XXX use aio to speed up remove of objects */
	for (i = 0; i < count; i++) {
		struct sd_req hdr;
		int res;

		sd_init_req(&hdr, SD_OP_REMOVE_OBJ);
		hdr.obj.oid = vid_to_data_oid(vid, start + i);
		res = exec_local_req(&hdr, NULL);
		/*
		 * Remember the error code unless the removal succeeded or
		 * the object did not exist.  It is kept in its own variable
		 * because 'ret' is reassigned by the meta write below, which
		 * previously discarded every removal failure.
		 */
		if (res != SD_RES_SUCCESS && res != SD_RES_NO_OBJ)
			remove_ret = res;
	}

	hd = (struct header *)meta;
	ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to update meta %"PRIx64 ", %s", oid,
		       sd_strerror(ret));
		goto out;
	}
	sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free);

	/* The free list is persisted; now report any removal failure */
	ret = remove_ret;
out:
	free(meta);
	free(inode);
	return ret;
}