Пример #1
0
/*
 * Make sure we don't overwrite the existing data for misaligned write
 *
 * If either offset or length of request isn't aligned to
 * SD_EC_DATA_STRIPE_SIZE, we have to read the unaligned blocks before write.
 * This kind of write amplification indeed slow down the write operation with
 * extra read overhead.
 */
static void *init_erasure_buffer(struct request *req, int buf_len)
{
	char *buf;
	uint32_t len = req->rq.data_length;
	uint64_t off = req->rq.obj.offset;
	uint64_t oid = req->rq.obj.oid;
	int opcode = req->rq.opcode;
	struct sd_req hdr;
	uint64_t head = round_down(off, SD_EC_DATA_STRIPE_SIZE);
	uint64_t tail = round_down(off + len, SD_EC_DATA_STRIPE_SIZE);
	int ret;

	buf = valloc(buf_len);
	if(unlikely(!buf))
		return NULL;
	memset(buf, 0, buf_len);

	if (opcode != SD_OP_WRITE_OBJ)
		goto out;

	if (off % SD_EC_DATA_STRIPE_SIZE) {
		/* Read head */
		sd_init_req(&hdr, SD_OP_READ_OBJ);
		hdr.obj.oid = oid;
		hdr.data_length = SD_EC_DATA_STRIPE_SIZE;
		hdr.obj.offset = head;
		ret = exec_local_req(&hdr, buf);
		if (ret != SD_RES_SUCCESS) {
			free(buf);
			return NULL;
		}
	}

	if ((len + off) % SD_EC_DATA_STRIPE_SIZE && tail - head > 0) {
		/* Read tail */
		sd_init_req(&hdr, SD_OP_READ_OBJ);
		hdr.obj.oid = oid;
		hdr.data_length = SD_EC_DATA_STRIPE_SIZE;
		hdr.obj.offset = tail;
		ret = exec_local_req(&hdr, buf + tail - head);
		if (ret != SD_RES_SUCCESS) {
			free(buf);
			return NULL;
		}
	}
out:
	memcpy(buf + off % SD_EC_DATA_STRIPE_SIZE, req->data, len);
	return buf;
}
Пример #2
0
int sd_discard_object(uint64_t oid)
{
	int ret;
	struct sd_req hdr;

	sd_init_req(&hdr, SD_OP_DISCARD_OBJ);
	hdr.obj.oid = oid;

	ret = exec_local_req(&hdr, NULL);
	if (ret != SD_RES_SUCCESS)
		sd_err("Failed to discard data obj %"PRIu64" %s", oid,
		       sd_strerror(ret));

	return ret;
}
Пример #3
0
int remove_object(uint64_t oid)
{
	struct sd_req hdr;
	int ret;

	sd_init_req(&hdr, SD_OP_REMOVE_OBJ);
	hdr.obj.oid = oid;
	hdr.obj.copies = get_vdi_copy_number(oid_to_vid(oid));

	ret = exec_local_req(&hdr, NULL);
	if (ret != SD_RES_SUCCESS)
		sd_eprintf("failed to remove object %" PRIx64 ", %s", oid,
			   sd_strerror(ret));

	return ret;
}
Пример #4
0
static void notify_recovery_completion_work(struct work *work)
{
	struct recovery_work *rw = container_of(work, struct recovery_work,
						work);
	struct sd_req hdr;
	int ret;

	sd_init_req(&hdr, SD_OP_COMPLETE_RECOVERY);
	hdr.obj.tgt_epoch = rw->epoch;
	hdr.flags = SD_FLAG_CMD_WRITE;
	hdr.data_length = sizeof(sys->this_node);

	ret = exec_local_req(&hdr, &sys->this_node);
	if (ret != SD_RES_SUCCESS)
		sd_eprintf("failed to notify recovery completion, %d",
			   rw->epoch);
}
Пример #5
0
int read_backend_object(uint64_t oid, char *data, unsigned int datalen,
			uint64_t offset)
{
	struct sd_req hdr;
	int ret;

	sd_init_req(&hdr, SD_OP_READ_OBJ);
	hdr.data_length = datalen;
	hdr.obj.oid = oid;
	hdr.obj.offset = offset;

	ret = exec_local_req(&hdr, data);
	if (ret != SD_RES_SUCCESS)
		sd_err("failed to read object %" PRIx64 ", %s", oid,
		       sd_strerror(ret));
	return ret;
}
Пример #6
0
int sd_remove_object(uint64_t oid)
{
	struct sd_req hdr;
	int ret;

	if (sys->enable_object_cache && object_is_cached(oid)) {
		ret = object_cache_remove(oid);
		if (ret != SD_RES_SUCCESS)
			return ret;
	}

	sd_init_req(&hdr, SD_OP_REMOVE_OBJ);
	hdr.obj.oid = oid;

	ret = exec_local_req(&hdr, NULL);
	if (ret != SD_RES_SUCCESS)
		sd_err("failed to remove object %" PRIx64 ", %s", oid,
		       sd_strerror(ret));

	return ret;
}
Пример #7
0
int read_backend_object(uint64_t oid, char *data, unsigned int datalen,
			uint64_t offset)
{
	struct sd_req hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	int ret;

	sd_init_req(&hdr, SD_OP_READ_OBJ);
	hdr.data_length = datalen;
	hdr.obj.oid = oid;
	hdr.obj.offset = offset;
	hdr.obj.copies = get_vdi_copy_number(oid_to_vid(oid));

	ret = exec_local_req(&hdr, data);
	if (ret != SD_RES_SUCCESS)
		sd_eprintf("failed to read object %" PRIx64 ", %s", oid,
			   sd_strerror(ret));

	untrim_zero_blocks(data, rsp->obj.offset, rsp->data_length, datalen);

	return ret;
}
Пример #8
0
/* Write data to both local object cache (if enabled) and backends */
int write_object(uint64_t oid, char *data, unsigned int datalen,
		 uint64_t offset, bool create)
{
	struct sd_req hdr;
	int ret;

	if (sys->enable_object_cache && object_is_cached(oid)) {
		ret = object_cache_write(oid, data, datalen, offset,
					 create);
		if (ret == SD_RES_NO_CACHE)
			goto forward_write;

		if (ret != 0) {
			sd_eprintf("write cache failed %"PRIx64" %"PRIx32, oid,
				   ret);
			return ret;
		}
	}

forward_write:
	if (create)
		sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ);
	else
		sd_init_req(&hdr, SD_OP_WRITE_OBJ);
	hdr.flags = SD_FLAG_CMD_WRITE;
	hdr.data_length = datalen;

	hdr.obj.oid = oid;
	hdr.obj.offset = offset;
	hdr.obj.copies = get_vdi_copy_number(oid_to_vid(oid));

	ret = exec_local_req(&hdr, data);
	if (ret != SD_RES_SUCCESS)
		sd_eprintf("failed to write object %" PRIx64 ", %s", oid,
			   sd_strerror(ret));

	return ret;
}
Пример #9
0
/*
 * Initialize the data vdi
 *
 * @vid: the vdi where the allocator resides
 */
int oalloc_init(uint32_t vid)
{
	struct strbuf buf = STRBUF_INIT;
	struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
	struct header hd = {
		.nr_free = 1,
	};
	struct free_desc fd = {
		.start = 1, /* Use first object as the meta object */
		.count = MAX_DATA_OBJS - 1,
	};
	int ret;

	strbuf_add(&buf, &hd, sizeof(hd));
	strbuf_add(&buf, &fd, sizeof(fd));

	ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode,
			     sizeof(*inode), 0);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to read inode, %" PRIx32", %s", vid,
		       sd_strerror(ret));
		goto out;
	}
	ret = sd_write_object(vid_to_data_oid(vid, 0), buf.buf,
			      buf.len, 0, true);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to create meta object for %" PRIx32", %s", vid,
		       sd_strerror(ret));
		goto out;
	}
	sd_inode_set_vid(inode, 0, vid);
	ret = sd_inode_write_vid(inode, 0, vid, vid, 0, false, false);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to update inode, %" PRIx32", %s", vid,
		       sd_strerror(ret));
		goto out;
	}
out:
	strbuf_release(&buf);
	free(inode);
	return ret;
}

/*
 * Allocate the objects and update the free list.
 *
 * Callers are expected to call oalloc_new_finish() to update the inode bitmap
 * after filling up the data.
 *
 * @vid: the vdi where the allocator resides
 * @start: start index of the objects to allocate
 * @count: number of the objects to allocate
 */
int oalloc_new_prepare(uint32_t vid, uint64_t *start, uint64_t count)
{
	char *meta = xvalloc(SD_DATA_OBJ_SIZE);
	struct header *hd;
	struct free_desc *fd;
	uint64_t oid = vid_to_data_oid(vid, 0), i;
	int ret;

	ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to read meta %" PRIx64 ", %s", oid,
		       sd_strerror(ret));
		goto out;
	}

	hd = (struct header *)meta;
	fd = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1;
	sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free);
	for (i = 0; i < hd->nr_free; i++, fd--) {
		sd_debug("start %"PRIu64", count %"PRIu64, fd->start,
			 fd->count);
		if (fd->count > count)
			break;
	}
	if (i == hd->nr_free) {
		ret = SD_RES_NO_SPACE;
		goto out;
	}

	*start = fd->start;
	fd->start += count;
	fd->count -= count;
	hd->used += count;

	/* Update the meta object */
	ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false);
	if (ret != SD_RES_SUCCESS)
		sd_err("failed to update meta %"PRIx64 ", %s", oid,
		       sd_strerror(ret));
out:
	free(meta);
	return ret;
}

/*
 * Update the inode map of the vid
 *
 * @vid: the vdi where the allocator resides
 * @start: start index of the objects to update
 * @count: number of the objects to update
 */
int oalloc_new_finish(uint32_t vid, uint64_t start, uint64_t count)
{
	struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
	int ret;

	ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode,
			     sizeof(*inode), 0);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to read inode, %" PRIx64 ", %s",
		       vid_to_vdi_oid(vid), sd_strerror(ret));
		goto out;
	}

	sd_debug("start %"PRIu64" end %"PRIu64, start, start + count - 1);
	sd_inode_set_vid_range(inode, start, (start + count - 1), vid);

	ret = sd_inode_write(inode, 0, false, false);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to update inode, %" PRIx64", %s",
		       vid_to_vdi_oid(vid), sd_strerror(ret));
		goto out;
	}
out:
	free(inode);
	return ret;
}

static int free_desc_cmp(struct free_desc *a, struct free_desc *b)
{
	return -intcmp(a->start, b->start);
}

static inline int update_and_merge_free_desc(char *meta, uint64_t start,
					     uint64_t count, uint32_t vid)
{
	struct header *hd = (struct header *)meta;
	struct free_desc *tail, *fd = HEADER_TO_FREE_DESC(hd);
	uint64_t i, j;

	/* Try our best to merge it in place, or append it to tail */
	for (i = 0; i < hd->nr_free; i++) {
		if (start + count == fd->start) {
			fd->start = start;
			fd->count += count;
			break;
		} else if(fd->start + fd->count == start) {
			fd->count +=count;
			break;
		}
		fd++;
	}

	if (i == hd->nr_free) {
		if (hd->nr_free >= MAX_FREE_DESC)
			return SD_RES_NO_SPACE;

		tail = (struct free_desc *)(meta + oalloc_meta_length(hd));
		tail->start = start;
		tail->count = count;
		hd->nr_free++;
	}

	hd->used -= count;
	xqsort(HEADER_TO_FREE_DESC(hd), hd->nr_free, free_desc_cmp);

	/* Merge as hard as we can */
	j = hd->nr_free - 1;
	tail = (struct free_desc *)(meta + oalloc_meta_length(hd)) - 1;
	for (i = 0; i < j; i++, tail--) {
		struct free_desc *front = tail - 1;

		sd_debug("start %"PRIu64", count %"PRIu64, tail->start,
			 tail->count);
		if (tail->start + tail->count > front->start)
			sd_emerg("bad free descriptor found at %"PRIx32, vid);
		if (tail->start + tail->count == front->start) {
			front->start = tail->start;
			front->count += tail->count;
			memmove(tail, tail + 1, sizeof(*tail) * i);
			hd->nr_free--;
		}
	}

	return SD_RES_SUCCESS;
}

/*
 * Discard the allocated objects and update the free list of the allocator
 *
 * Caller should check the return value since it might fail.
 *
 * @vid: the vdi where the allocator resides
 * @start: start index of the objects to free
 * @count: number of the objects to free
 */
int oalloc_free(uint32_t vid, uint64_t start, uint64_t count)
{
	char *meta = xvalloc(SD_DATA_OBJ_SIZE);
	struct header *hd;
	uint64_t oid = vid_to_data_oid(vid, 0), i;
	struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
	int ret;

	ret = sd_read_object(vid_to_vdi_oid(vid), (char *)inode,
			     sizeof(*inode), 0);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to read inode, %" PRIx64 ", %s",
		       vid_to_vdi_oid(vid), sd_strerror(ret));
		goto out;
	}

	sd_debug("discard start %"PRIu64" end %"PRIu64, start,
		 start + count - 1);
	sd_inode_set_vid_range(inode, start, (start + count - 1), 0);

	ret = sd_inode_write(inode, 0, false, false);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to update inode, %" PRIx64", %s",
		       vid_to_vdi_oid(vid), sd_strerror(ret));
		goto out;
	}

	ret = sd_read_object(oid, meta, SD_DATA_OBJ_SIZE, 0);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to read meta %" PRIx64 ", %s", oid,
		       sd_strerror(ret));
		goto out;
	}

	ret = update_and_merge_free_desc(meta, start, count, vid);
	if (ret != SD_RES_SUCCESS)
		goto out;

	/* XXX use aio to speed up remove of objects */
	for (i = 0; i < count; i++) {
		struct sd_req hdr;
		int res;

		sd_init_req(&hdr, SD_OP_REMOVE_OBJ);
		hdr.obj.oid = vid_to_data_oid(vid, start + i);
		res = exec_local_req(&hdr, NULL);
		/*
		 * return the error code if it does not
		 * success or can't find obj.
		 */
		if (res != SD_RES_SUCCESS && res != SD_RES_NO_OBJ)
			ret = res;
	}

	hd = (struct header *)meta;
	ret = sd_write_object(oid, meta, oalloc_meta_length(hd), 0, false);
	if (ret != SD_RES_SUCCESS) {
		sd_err("failed to update meta %"PRIx64 ", %s", oid,
		       sd_strerror(ret));
		goto out;
	}
	sd_debug("used %"PRIu64", nr_free %"PRIu64, hd->used, hd->nr_free);
out:
	free(meta);
	free(inode);
	return ret;
}