Example #1
0
static int cluster_info(int argc, char **argv)
{
	int i, ret;
	struct sd_req hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	struct epoch_log *logs;
	int nr_logs, log_length;
	time_t ti, ct;
	struct tm tm;
	char time_str[128];

	log_length = sd_epoch * sizeof(struct epoch_log);
	logs = xmalloc(log_length);

	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
	hdr.data_length = log_length;

	ret = dog_exec_req(&sd_nid, &hdr, logs);
	if (ret < 0)
		goto error;

	/* show cluster status */
	if (!raw_output)
		printf("Cluster status: ");
	if (rsp->result == SD_RES_SUCCESS)
		printf("running, auto-recovery %s\n", logs->disable_recovery ?
		       "disabled" : "enabled");
	else
		printf("%s\n", sd_strerror(rsp->result));

	/* show cluster backend store */
	if (cluster_cmd_data.show_store) {
		if (!raw_output)
			printf("Cluster store: ");
		if (rsp->result == SD_RES_SUCCESS) {
			char copy[10];
			int data, parity;
			if (!logs->copy_policy)
				snprintf(copy, sizeof(copy), "%d",
					 logs->nr_copies);
			else {
				ec_policy_to_dp(logs->copy_policy,
						&data, &parity);
				snprintf(copy, sizeof(copy), "%d:%d",
					 data, parity);
			}
			printf("%s with %s redundancy policy\n",
			       logs->drv_name, copy);
		} else
			printf("%s\n", sd_strerror(rsp->result));
	}

	if (!raw_output && rsp->data_length > 0) {
		ct = logs[0].ctime >> 32;
		printf("\nCluster created at %s\n", ctime(&ct));
		printf("Epoch Time           Version\n");
	}
Example #2
0
/*
 * Return the size used to store object @oid.
 *
 * For an erasure-coded object the VDI's copy policy is looked up and the
 * result is SD_DATA_OBJ_SIZE divided by the number of data strips reported
 * by ec_policy_to_dp(). Every other object is sized by get_objsize().
 */
size_t get_store_objsize(uint64_t oid)
{
	uint8_t vdi_policy;
	int nr_data;

	/* Non-erasure objects need no strip arithmetic. */
	if (!is_erasure_oid(oid))
		return get_objsize(oid);

	vdi_policy = get_vdi_copy_policy(oid_to_vid(oid));
	/* Only the data-strip count is needed; parity is not requested. */
	ec_policy_to_dp(vdi_policy, &nr_data, NULL);

	return SD_DATA_OBJ_SIZE / nr_data;
}
Example #3
0
/*
 * ec_max_data_strip() returns the maximum number of data strips used in the
 * cluster, derived from the cluster-wide copy policy. It returns 0 when no
 * copy policy is set.
 *
 * When nr_zones is smaller than this value, stale objects are not purged,
 * because with erasure coding there is only one copy of the data.
 */
int ec_max_data_strip(void)
{
	int nr_data = 0;

	/* Only the data-strip count is needed; parity is not requested. */
	if (sys->cinfo.copy_policy)
		ec_policy_to_dp(sys->cinfo.copy_policy, &nr_data, NULL);

	return nr_data;
}
Example #4
0
/*
 * Return the size used to store object @oid.
 *
 * @copy_policy: 0 selects the get_objsize() path; any other value is passed
 *               to ec_policy_to_dp() to obtain the data-strip count.
 * @oid:         object id.
 *
 * VDI objects always report SD_INODE_SIZE. With a non-zero copy policy the
 * result is SD_DATA_OBJ_SIZE divided by the number of data strips.
 */
size_t get_store_objsize(uint8_t copy_policy, uint64_t oid)
{
	int nr_data;

	if (is_vdi_obj(oid))
		return SD_INODE_SIZE;

	if (copy_policy == 0)
		return get_objsize(oid);

	/* Only the data-strip count is needed; parity is not requested. */
	ec_policy_to_dp(copy_policy, &nr_data, NULL);

	return SD_DATA_OBJ_SIZE / nr_data;
}
Example #5
0
int default_create_and_write(uint64_t oid, const struct siocb *iocb)
{
	char path[PATH_MAX], tmp_path[PATH_MAX];
	int flags = prepare_iocb(oid, iocb, true);
	int ret, fd;
	uint32_t len = iocb->length;
	bool ec = is_erasure_obj(oid, iocb->copy_policy);
	size_t obj_size;

	sd_debug("%"PRIx64, oid);
	get_obj_path(oid, path, sizeof(path));
	get_tmp_obj_path(oid, tmp_path, sizeof(tmp_path));

	if (uatomic_is_true(&sys->use_journal) &&
	    journal_write_store(oid, iocb->buf, iocb->length,
				iocb->offset, true)
	    != SD_RES_SUCCESS) {
		sd_err("turn off journaling");
		uatomic_set_false(&sys->use_journal);
		flags |= O_DSYNC;
		sync();
	}

	fd = open(tmp_path, flags, sd_def_fmode);
	if (fd < 0) {
		if (errno == EEXIST) {
			/*
			 * This happens if node membership changes during object
			 * creation; while gateway retries a CREATE request,
			 * recovery process could also recover the object at the
			 * same time.  They should try to write the same data,
			 * so it is okay to simply return success here.
			 */
			sd_debug("%s exists", tmp_path);
			return SD_RES_SUCCESS;
		}

		sd_err("failed to open %s: %m", tmp_path);
		return err_to_sderr(path, oid, errno);
	}

	if (ec) {
		uint8_t policy = iocb->copy_policy ?:
			get_vdi_copy_policy(oid_to_vid(oid));
		int d;
		ec_policy_to_dp(policy, &d, NULL);
		obj_size = SD_DATA_OBJ_SIZE / d;
	} else
Example #6
0
/*
 * Return the size used to store object @oid.
 *
 * @copy_policy:      0 selects the get_objsize() path; any other value is
 *                    passed to ec_policy_to_dp() to obtain the data-strip
 *                    count.
 * @block_size_shift: the object size is computed as 1 << block_size_shift.
 * @oid:              object id.
 *
 * VDI objects report SD_INODE_SIZE and VDI btree objects report
 * SD_INODE_DATA_INDEX_SIZE, regardless of the other arguments. With a
 * non-zero copy policy the result is the object size divided by the number
 * of data strips.
 */
size_t get_store_objsize(uint8_t copy_policy, uint8_t block_size_shift,
			 uint64_t oid)
{
	uint32_t object_size;
	int nr_data;

	if (is_vdi_obj(oid))
		return SD_INODE_SIZE;
	if (is_vdi_btree_obj(oid))
		return SD_INODE_DATA_INDEX_SIZE;

	object_size = UINT32_C(1) << block_size_shift;

	if (copy_policy == 0)
		return get_objsize(oid, object_size);

	/* Only the data-strip count is needed; parity is not requested. */
	ec_policy_to_dp(copy_policy, &nr_data, NULL);

	return object_size / nr_data;
}
Example #7
0
static void finish_requests(struct request *req, struct req_iter *reqs,
			    int nr_to_send)
{
	uint64_t oid = req->rq.obj.oid;
	uint32_t len = req->rq.data_length;
	uint64_t off = req->rq.obj.offset;
	int opcode = req->rq.opcode;
	int start = off / SD_EC_DATA_STRIPE_SIZE;
	int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j;
	int nr_stripe = end - start;

	if (!is_erasure_oid(oid))
		goto out;

	sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
		 start, end, nr_to_send, off, len);

	/* We need to assemble the data strips into the req buffer for read */
	if (opcode == SD_OP_READ_OBJ) {
		char *p, *buf;
		uint8_t policy = req->rq.obj.copy_policy ?:
			get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));
		int ed = 0, strip_size;

		buf = malloc(SD_EC_DATA_STRIPE_SIZE * nr_stripe);
		if(unlikely(!buf)) {
			goto out;
		}

		ec_policy_to_dp(policy, &ed, NULL);
		strip_size = SD_EC_DATA_STRIPE_SIZE / ed;

		p = buf;
		for (i = 0; i < nr_stripe; i++) {
			for (j = 0; j < nr_to_send; j++) {
				memcpy(p, reqs[j].buf + strip_size * i,
				       strip_size);
				p += strip_size;
			}
		}
		memcpy(req->data, buf + off % SD_EC_DATA_STRIPE_SIZE, len);
		req->rp.data_length = req->rq.data_length;
		free(buf);
	}
Example #8
0
/*
 * Build the per-strip sub-requests for an erasure-coded object request.
 *
 * For a write, the data strips of @req are spread, together with their
 * parity strips, over the replicas. For a read only the data strip buffers
 * need to be prepared.
 *
 * @req: the original request.
 * @nr:  out parameter; receives the number of sub-requests: the data-strip
 *       count for SD_OP_READ_OBJ, data plus parity strips for every other
 *       opcode.
 *
 * Every sub-request gets a buffer covering all stripes touched by the
 * request. For SD_OP_WRITE_OBJ and SD_OP_CREATE_AND_WRITE_OBJ the stripe
 * buffer from init_erasure_buffer() is cut into data strips and the parity
 * strips are produced by ec_encode().
 *
 * Returns the sub-request array, or NULL when init_erasure_buffer() fails
 * (in which case all sub-request buffers have already been released).
 */
static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
{
	uint32_t len = req->rq.data_length;
	uint64_t off = req->rq.obj.offset;
	int opcode = req->rq.opcode;
	int is_write = (opcode == SD_OP_WRITE_OBJ ||
			opcode == SD_OP_CREATE_AND_WRITE_OBJ);
	int first = off / SD_EC_DATA_STRIPE_SIZE;
	int last = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE);
	int stripes = last - first;
	int nr_data = 0, nr_parity = 0, nr_total;
	int strip_len, chunk_len, count, s, k;
	uint8_t policy;
	struct fec *fec;
	struct req_iter *iters;
	char *stripe_buf = NULL, *cursor;

	/* A policy carried by the request takes precedence over the VDI's. */
	if (req->rq.obj.copy_policy)
		policy = req->rq.obj.copy_policy;
	else
		policy = get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));

	nr_total = ec_policy_to_dp(policy, &nr_data, &nr_parity);
	fec = ec_init(nr_data, nr_total);

	/* Reads only touch the data strips; everything else touches all. */
	count = (opcode == SD_OP_READ_OBJ) ? nr_data : nr_total;
	*nr = count;

	strip_len = SD_EC_DATA_STRIPE_SIZE / nr_data;
	iters = xzalloc(sizeof(*iters) * count);

	sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
		 first, last, count, off, len);

	/* Each sub-request holds one strip from every stripe in range. */
	chunk_len = strip_len * stripes;
	for (k = 0; k < count; k++) {
		iters[k].buf = xmalloc(chunk_len);
		iters[k].dlen = chunk_len;
		iters[k].off = first * strip_len;
		if (is_write)
			iters[k].wlen = chunk_len;
	}

	/* Read and remove operations need no encoding. */
	if (!is_write)
		goto out;

	stripe_buf = init_erasure_buffer(req, SD_EC_DATA_STRIPE_SIZE * stripes);
	if (!stripe_buf) {
		sd_err("failed to init erasure buffer %"PRIx64,
		       req->rq.obj.oid);
		for (k = 0; k < count; k++)
			free(iters[k].buf);
		free(iters);
		iters = NULL;
		goto out;
	}

	cursor = stripe_buf;
	for (s = 0; s < stripes; s++) {
		const uint8_t *ds[nr_data];
		uint8_t *ps[nr_parity];

		/* Scatter this stripe's payload into the data strips. */
		for (k = 0; k < nr_data; k++) {
			ds[k] = iters[k].buf + strip_len * s;
			memcpy((uint8_t *)ds[k], cursor + k * strip_len,
			       strip_len);
		}

		/* Parity strips live in the sub-requests after the data. */
		for (k = 0; k < nr_parity; k++)
			ps[k] = iters[nr_data + k].buf + strip_len * s;

		ec_encode(fec, ds, ps);
		cursor += SD_EC_DATA_STRIPE_SIZE;
	}
out:
	ec_destroy(fec);
	free(stripe_buf);

	return iters;
}
Example #9
0
static int cluster_info(int argc, char **argv)
{
	int i, ret;
	struct sd_req hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	struct epoch_log *logs, *log;
	char *next_log;
	int nr_logs, log_length;
	time_t ti, ct;
	struct tm tm;
	char time_str[128];
	uint32_t nodes_nr;

	nodes_nr = sd_nodes_nr;
	log_length = sd_epoch * (sizeof(struct epoch_log)
			+ nodes_nr * sizeof(struct sd_node));
	logs = xmalloc(log_length);

retry:
	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
	hdr.data_length = log_length;
	hdr.cluster.nodes_nr = nodes_nr;

	ret = dog_exec_req(&sd_nid, &hdr, logs);
	if (ret < 0)
		goto error;
	if (rsp->result == SD_RES_BUFFER_SMALL) {
		nodes_nr *= 2;
		log_length = sd_epoch * (sizeof(struct epoch_log)
				+ nodes_nr * sizeof(struct sd_node));
		logs = xrealloc(logs, log_length);
		goto retry;
	}

	/* show cluster status */
	if (!raw_output)
		printf("Cluster status: ");
	if (rsp->result == SD_RES_SUCCESS)
		printf("running, auto-recovery enabled\n");
	else
		printf("%s\n", sd_strerror(rsp->result));

	if (verbose) {
		/* show cluster backend store */
		if (!raw_output)
			printf("Cluster store: ");
		if (rsp->result == SD_RES_SUCCESS) {
			char copy[10];
			int data, parity;
			if (!logs->copy_policy)
				snprintf(copy, sizeof(copy), "%d",
					 logs->nr_copies);
			else {
				ec_policy_to_dp(logs->copy_policy,
						&data, &parity);
				snprintf(copy, sizeof(copy), "%d:%d",
					 data, parity);
			}
			printf("%s with %s redundancy policy\n",
			       logs->drv_name, copy);
		} else
			printf("%s\n", sd_strerror(rsp->result));

		/* show vnode mode (node or disk) for cluster */
		if (!raw_output)
			printf("Cluster vnode mode: ");
		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
			printf("disk");
		else
			printf("node");
	}

	if (!raw_output && rsp->data_length > 0) {
		ct = logs[0].ctime >> 32;
		printf("\nCluster created at %s\n", ctime(&ct));
		printf("Epoch Time           Version\n");
	}