/*
 * 'dog cluster info' handler: query the cluster manager for its status and
 * per-epoch logs, then print a human-readable summary.
 *
 * NOTE(review): this chunk is truncated — the 'error:' label targeted by
 * 'goto error' and the rest of the function are not visible here.
 */
static int cluster_info(int argc, char **argv)
{
	int i, ret;
	struct sd_req hdr;
	/* the response is read back through the same header buffer */
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	struct epoch_log *logs;
	int nr_logs, log_length;
	time_t ti, ct;
	struct tm tm;
	char time_str[128];

	/* one epoch_log record per epoch seen so far */
	log_length = sd_epoch * sizeof(struct epoch_log);
	logs = xmalloc(log_length);

	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
	hdr.data_length = log_length;

	ret = dog_exec_req(&sd_nid, &hdr, logs);
	if (ret < 0)
		goto error;

	/* show cluster status */
	if (!raw_output)
		printf("Cluster status: ");
	if (rsp->result == SD_RES_SUCCESS)
		printf("running, auto-recovery %s\n", logs->disable_recovery ?
		       "disabled" : "enabled");
	else
		printf("%s\n", sd_strerror(rsp->result));

	/* show cluster backend store */
	if (cluster_cmd_data.show_store) {
		if (!raw_output)
			printf("Cluster store: ");
		if (rsp->result == SD_RES_SUCCESS) {
			char copy[10];
			int data, parity;
			if (!logs->copy_policy)
				/* plain replication: "<nr_copies>" */
				snprintf(copy, sizeof(copy), "%d",
					 logs->nr_copies);
			else {
				/* erasure coding: "<data>:<parity>" strips */
				ec_policy_to_dp(logs->copy_policy, &data,
						&parity);
				snprintf(copy, sizeof(copy), "%d:%d",
					 data, parity);
			}
			printf("%s with %s redundancy policy\n",
			       logs->drv_name, copy);
		} else
			printf("%s\n", sd_strerror(rsp->result));
	}

	if (!raw_output && rsp->data_length > 0) {
		/* creation time lives in the upper 32 bits of ctime */
		ct = logs[0].ctime >> 32;
		printf("\nCluster created at %s\n", ctime(&ct));
		printf("Epoch Time Version\n");
	}
/*
 * Return the size an object with @oid occupies in the local store.
 *
 * An erasure-coded object is split into data strips, so each store holds
 * only objsize / nr_data bytes of it; other objects keep their regular
 * size as reported by get_objsize().
 */
size_t get_store_objsize(uint64_t oid)
{
	int nr_data;
	uint8_t policy;

	if (!is_erasure_oid(oid))
		return get_objsize(oid);

	/* look up the vdi's redundancy policy to find the strip count */
	policy = get_vdi_copy_policy(oid_to_vid(oid));
	ec_policy_to_dp(policy, &nr_data, NULL);

	return SD_DATA_OBJ_SIZE / nr_data;
}
/*
 * ec_max_data_strip represents the max number of data strips in the
 * cluster.  When nr_zones is smaller than this, we don't purge the stale
 * objects, because for erasure coding there is only one copy of the data.
 *
 * Returns 0 when the cluster does not use erasure coding.
 */
int ec_max_data_strip(void)
{
	int nr_data = 0;

	if (sys->cinfo.copy_policy)
		ec_policy_to_dp(sys->cinfo.copy_policy, &nr_data, NULL);

	return nr_data;
}
/*
 * Return the size an object with @oid occupies in the local store under
 * redundancy policy @copy_policy.
 *
 * Inode objects always have the fixed SD_INODE_SIZE.  For erasure coding
 * (@copy_policy != 0) each store holds one of the nr_data strips; plain
 * replication keeps the regular object size.
 */
size_t get_store_objsize(uint8_t copy_policy, uint64_t oid)
{
	int nr_data;

	if (is_vdi_obj(oid))
		return SD_INODE_SIZE;

	if (!copy_policy)
		return get_objsize(oid);

	/* erasure coded: size of a single data strip */
	ec_policy_to_dp(copy_policy, &nr_data, NULL);
	return SD_DATA_OBJ_SIZE / nr_data;
}
/*
 * Create an object file for @oid and write @iocb->buf into it via a
 * temporary path (renamed into place later, outside this view).
 *
 * NOTE(review): this chunk is truncated — it ends at the 'else' branch of
 * the object-size computation; the rest of the function is not visible.
 */
int default_create_and_write(uint64_t oid, const struct siocb *iocb)
{
	char path[PATH_MAX], tmp_path[PATH_MAX];
	int flags = prepare_iocb(oid, iocb, true);
	int ret, fd;
	uint32_t len = iocb->length;
	bool ec = is_erasure_obj(oid, iocb->copy_policy);
	size_t obj_size;

	sd_debug("%"PRIx64, oid);
	get_obj_path(oid, path, sizeof(path));
	get_tmp_obj_path(oid, tmp_path, sizeof(tmp_path));

	/* journal the write first; if that fails, disable journaling and
	 * fall back to synchronous (O_DSYNC) writes */
	if (uatomic_is_true(&sys->use_journal) &&
	    journal_write_store(oid, iocb->buf, iocb->length, iocb->offset,
				true)
	    != SD_RES_SUCCESS) {
		sd_err("turn off journaling");
		uatomic_set_false(&sys->use_journal);
		flags |= O_DSYNC;
		sync();
	}

	fd = open(tmp_path, flags, sd_def_fmode);
	if (fd < 0) {
		if (errno == EEXIST) {
			/*
			 * This happens if node membership changes during object
			 * creation; while gateway retries a CREATE request,
			 * recovery process could also recover the object at the
			 * same time. They should try to write the same data,
			 * so it is okay to simply return success here.
			 */
			sd_debug("%s exists", tmp_path);
			return SD_RES_SUCCESS;
		}

		sd_err("failed to open %s: %m", tmp_path);
		return err_to_sderr(path, oid, errno);
	}

	if (ec) {
		/* erasure coded: the stored size is one data strip;
		 * fall back to the vdi's policy when the iocb carries none */
		uint8_t policy = iocb->copy_policy ?:
			get_vdi_copy_policy(oid_to_vid(oid));
		int d;
		ec_policy_to_dp(policy, &d, NULL);
		obj_size = SD_DATA_OBJ_SIZE / d;
	} else
/*
 * Return the size an object with @oid occupies in the local store.
 *
 * @copy_policy:      redundancy policy; 0 means plain replication,
 *                    non-zero encodes an erasure-coding data:parity pair.
 * @block_size_shift: log2 of the cluster's object block size.
 * @oid:              object id to size.
 *
 * Fixed-size metadata objects (inode, inode b-tree) are special-cased.
 * Erasure-coded objects occupy one data strip (object_size / nr_data);
 * replicated objects use get_objsize().
 */
size_t get_store_objsize(uint8_t copy_policy, uint8_t block_size_shift,
			 uint64_t oid)
{
	if (is_vdi_obj(oid))
		return SD_INODE_SIZE;
	if (is_vdi_btree_obj(oid))
		return SD_INODE_DATA_INDEX_SIZE;

	uint32_t object_size = UINT32_C(1) << block_size_shift;

	if (copy_policy == 0)
		return get_objsize(oid, object_size);

	/* erasure coded: size of a single data strip */
	int nr_data;
	ec_policy_to_dp(copy_policy, &nr_data, NULL);
	return object_size / nr_data;
}
/*
 * Post-process the per-replica requests @reqs issued for client request
 * @req.  For an erasure-coded read this reassembles the data strips
 * returned by each replica into the contiguous client buffer.
 *
 * NOTE(review): this chunk is truncated — the 'out:' label targeted by
 * 'goto out' and the end of the function are not visible here.
 */
static void finish_requests(struct request *req, struct req_iter *reqs,
			    int nr_to_send)
{
	uint64_t oid = req->rq.obj.oid;
	uint32_t len = req->rq.data_length;
	uint64_t off = req->rq.obj.offset;
	int opcode = req->rq.opcode;
	/* stripe range [start, end) covered by the byte range [off, off+len) */
	int start = off / SD_EC_DATA_STRIPE_SIZE;
	int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j;
	int nr_stripe = end - start;

	if (!is_erasure_oid(oid))
		goto out;

	sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
		 start, end, nr_to_send, off, len);

	/* We need to assemble the data strips into the req buffer for read */
	if (opcode == SD_OP_READ_OBJ) {
		char *p, *buf;
		/* fall back to the vdi's policy when the request carries none */
		uint8_t policy = req->rq.obj.copy_policy ?:
			get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));
		int ed = 0, strip_size;

		buf = malloc(SD_EC_DATA_STRIPE_SIZE * nr_stripe);
		if (unlikely(!buf)) {
			goto out;
		}
		ec_policy_to_dp(policy, &ed, NULL);
		strip_size = SD_EC_DATA_STRIPE_SIZE / ed;

		/* interleave strip i of every replica back into stripe order */
		p = buf;
		for (i = 0; i < nr_stripe; i++) {
			for (j = 0; j < nr_to_send; j++) {
				memcpy(p, reqs[j].buf + strip_size * i,
				       strip_size);
				p += strip_size;
			}
		}
		/* copy out only the byte range the client actually asked for */
		memcpy(req->data, buf + off % SD_EC_DATA_STRIPE_SIZE, len);
		req->rp.data_length = req->rq.data_length;
		free(buf);
	}
/*
 * We spread data strips of req along with its parity strips onto replica for
 * write operation. For read we only need to prepare data strip buffers.
 *
 * Returns an array of *nr per-replica request descriptors (caller frees),
 * or NULL if the erasure buffer could not be initialized.
 */
static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
{
	uint32_t len = req->rq.data_length;
	uint64_t off = req->rq.obj.offset;
	int opcode = req->rq.opcode;
	/* stripe range [start, end) covered by the byte range [off, off+len) */
	int start = off / SD_EC_DATA_STRIPE_SIZE;
	int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j;
	int nr_stripe = end - start;
	struct fec *ctx;
	int strip_size, nr_to_send;
	struct req_iter *reqs;
	char *p, *buf = NULL;
	/* fall back to the vdi's policy when the request carries none */
	uint8_t policy = req->rq.obj.copy_policy ?:
		get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));
	int ed = 0, ep = 0, edp;

	edp = ec_policy_to_dp(policy, &ed, &ep);
	ctx = ec_init(ed, edp);
	/* reads touch only the ed data replicas; writes all ed + ep */
	*nr = nr_to_send = (opcode == SD_OP_READ_OBJ) ? ed : edp;
	strip_size = SD_EC_DATA_STRIPE_SIZE / ed;
	reqs = xzalloc(sizeof(*reqs) * nr_to_send);

	sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32,
		 start, end, nr_to_send, off, len);

	/* one strip-sized buffer per replica, covering nr_stripe stripes */
	for (i = 0; i < nr_to_send; i++) {
		int l = strip_size * nr_stripe;

		reqs[i].buf = xmalloc(l);
		reqs[i].dlen = l;
		reqs[i].off = start * strip_size;
		switch (opcode) {
		case SD_OP_CREATE_AND_WRITE_OBJ:
		case SD_OP_WRITE_OBJ:
			/* only writes carry payload to the replica */
			reqs[i].wlen = l;
			break;
		default:
			break;
		}
	}

	if (opcode != SD_OP_WRITE_OBJ && opcode != SD_OP_CREATE_AND_WRITE_OBJ)
		goto out; /* Read and remove operation */

	p = buf = init_erasure_buffer(req, SD_EC_DATA_STRIPE_SIZE * nr_stripe);
	if (!buf) {
		sd_err("failed to init erasure buffer %"PRIx64,
		       req->rq.obj.oid);
		/* undo all per-replica allocations and signal failure */
		for (i = 0; i < nr_to_send; i++)
			free(reqs[i].buf);
		free(reqs);
		reqs = NULL;
		goto out;
	}

	/* split each stripe into ed data strips, then derive ep parity strips */
	for (i = 0; i < nr_stripe; i++) {
		const uint8_t *ds[ed];
		uint8_t *ps[ep];

		for (j = 0; j < ed; j++)
			ds[j] = reqs[j].buf + strip_size * i;
		for (j = 0; j < ep; j++)
			ps[j] = reqs[ed + j].buf + strip_size * i;
		for (j = 0; j < ed; j++)
			memcpy((uint8_t *)ds[j], p + j * strip_size,
			       strip_size);
		ec_encode(ctx, ds, ps);
		p += SD_EC_DATA_STRIPE_SIZE;
	}
out:
	ec_destroy(ctx);
	free(buf);
	return reqs;
}
/*
 * 'dog cluster info' handler: query the cluster manager for its status and
 * epoch logs (with per-epoch node lists) and print a summary.  Retries
 * with a doubled node-count estimate while the server reports
 * SD_RES_BUFFER_SMALL.
 *
 * NOTE(review): this chunk is truncated — the 'error:' label targeted by
 * 'goto error' and the rest of the function are not visible here.
 */
static int cluster_info(int argc, char **argv)
{
	int i, ret;
	struct sd_req hdr;
	/* the response is read back through the same header buffer */
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	struct epoch_log *logs, *log;
	char *next_log;
	int nr_logs, log_length;
	time_t ti, ct;
	struct tm tm;
	char time_str[128];
	uint32_t nodes_nr;

	/* initial guess: every epoch record lists the current node count */
	nodes_nr = sd_nodes_nr;
	log_length = sd_epoch * (sizeof(struct epoch_log)
			+ nodes_nr * sizeof(struct sd_node));
	logs = xmalloc(log_length);

retry:
	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
	hdr.data_length = log_length;
	hdr.cluster.nodes_nr = nodes_nr;

	ret = dog_exec_req(&sd_nid, &hdr, logs);
	if (ret < 0)
		goto error;
	if (rsp->result == SD_RES_BUFFER_SMALL) {
		/* buffer too small for the node lists: double and retry */
		nodes_nr *= 2;
		log_length = sd_epoch * (sizeof(struct epoch_log)
				+ nodes_nr * sizeof(struct sd_node));
		logs = xrealloc(logs, log_length);
		goto retry;
	}

	/* show cluster status */
	if (!raw_output)
		printf("Cluster status: ");
	if (rsp->result == SD_RES_SUCCESS)
		printf("running, auto-recovery enabled\n");
	else
		printf("%s\n", sd_strerror(rsp->result));

	if (verbose) {
		/* show cluster backend store */
		if (!raw_output)
			printf("Cluster store: ");
		if (rsp->result == SD_RES_SUCCESS) {
			char copy[10];
			int data, parity;
			if (!logs->copy_policy)
				/* plain replication: "<nr_copies>" */
				snprintf(copy, sizeof(copy), "%d",
					 logs->nr_copies);
			else {
				/* erasure coding: "<data>:<parity>" strips */
				ec_policy_to_dp(logs->copy_policy, &data,
						&parity);
				snprintf(copy, sizeof(copy), "%d:%d",
					 data, parity);
			}
			printf("%s with %s redundancy policy\n",
			       logs->drv_name, copy);
		} else
			printf("%s\n", sd_strerror(rsp->result));

		/* show vnode mode (node or disk) for cluster */
		/* NOTE(review): logs->flags is read even when the request
		 * did not succeed — confirm logs is valid on that path */
		if (!raw_output)
			printf("Cluster vnode mode: ");
		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
			printf("disk");
		else
			printf("node");
	}

	if (!raw_output && rsp->data_length > 0) {
		/* creation time lives in the upper 32 bits of ctime */
		ct = logs[0].ctime >> 32;
		printf("\nCluster created at %s\n", ctime(&ct));
		printf("Epoch Time Version\n");
	}