static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch) { int ret; struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE); struct siocb iocb = { .epoch = epoch, .buf = inode, .length = SD_INODE_HEADER_SIZE, }; ret = default_read(oid, &iocb); if (ret != SD_RES_SUCCESS) { sd_err("failed to read inode header %" PRIx64 " %" PRId32 "wat %s", oid, epoch, wd); goto out; } add_vdi_state(oid_to_vid(oid), inode->nr_copies, vdi_is_snapshot(inode), inode->copy_policy); atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse); ret = SD_RES_SUCCESS; out: free(inode); return ret; }
static int init_sys_vdi_bitmap(char *path) { DIR *dir; struct dirent *dent; dir = opendir(path); if (!dir) { vprintf(SDOG_ERR, "failed to open the working directory: %m\n"); return -1; } vprintf(SDOG_INFO, "found the working directory %s\n", path); while ((dent = readdir(dir))) { uint64_t oid; if (!strcmp(dent->d_name, ".")) continue; oid = strtoull(dent->d_name, NULL, 16); if (oid == 0 || oid == ULLONG_MAX) continue; if (!is_vdi_obj(oid)) continue; vprintf(SDOG_DEBUG, "found the VDI object %" PRIx64 "\n", oid); set_bit(oid_to_vid(oid), sys->vdi_inuse); } closedir(dir); return 0; }
static int init_vdi_copy_number(uint64_t oid) { char path[PATH_MAX]; int fd, flags = get_open_flags(oid, false), ret; struct sheepdog_inode *inode = xzalloc(sizeof(*inode)); snprintf(path, sizeof(path), "%s%016" PRIx64, obj_path, oid); fd = open(path, flags); if (fd < 0) { eprintf("failed to open %s, %m\n", path); ret = SD_RES_EIO; goto out; } ret = xpread(fd, inode, SD_INODE_HEADER_SIZE, 0); if (ret != SD_INODE_HEADER_SIZE) { eprintf("failed to read inode header, path=%s, %m\n", path); ret = SD_RES_EIO; goto out; } add_vdi_copy_number(oid_to_vid(oid), inode->nr_copies); ret = SD_RES_SUCCESS; out: free(inode); return SD_RES_SUCCESS; }
/* Trim zero blocks of the beginning and end of the object. */ static int default_trim(int fd, uint64_t oid, const struct siocb *iocb, uint64_t *poffset, uint32_t *plen) { trim_zero_blocks(iocb->buf, poffset, plen); if (iocb->offset < *poffset) { sd_debug("discard between %d, %ld, %016" PRIx64, iocb->offset, *poffset, oid); if (discard(fd, iocb->offset, *poffset) < 0) return -1; } if (*poffset + *plen < iocb->offset + iocb->length) { uint64_t end = iocb->offset + iocb->length; uint32_t object_size = get_vdi_object_size(oid_to_vid(oid)); if (end == get_objsize(oid, object_size)) /* This is necessary to punch the last block */ end = round_up(end, BLOCK_SIZE); sd_debug("discard between %ld, %ld, %016" PRIx64, *poffset + *plen, end, oid); if (discard(fd, *poffset + *plen, end) < 0) return -1; } return 0; }
static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch) { int ret; struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE); struct siocb iocb = { .epoch = epoch, .buf = inode, .length = SD_INODE_HEADER_SIZE, }; char path[PATH_MAX]; if (epoch == 0) get_store_path(oid, iocb.ec_index, path); else get_store_stale_path(oid, iocb.epoch, iocb.ec_index, path); ret = default_read_from_path(oid, path, &iocb); if (ret != SD_RES_SUCCESS) { sd_err("failed to read inode header %" PRIx64 " %" PRId32 "at %s", oid, epoch, path); goto out; } atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse); out: free(inode); return ret; }
/*
 * Return the size used to store the object 'oid'.
 *
 * For an erasure-coded object this is SD_DATA_OBJ_SIZE divided by the value
 * ec_policy_to_dp() stores for the VDI's copy policy; for any other object
 * it is get_objsize(oid).
 */
size_t get_store_objsize(uint64_t oid)
{
	uint8_t copy_policy;
	int d;

	if (!is_erasure_oid(oid))
		return get_objsize(oid);

	copy_policy = get_vdi_copy_policy(oid_to_vid(oid));
	ec_policy_to_dp(copy_policy, &d, NULL);

	return SD_DATA_OBJ_SIZE / d;
}
/*
 * Per-object initialization callback: insert 'oid' into the object list
 * cache and, for VDI objects, mark the VDI in sys->vdi_inuse and load its
 * copy number.
 *
 * 'arg' is unused.
 *
 * Returns SD_RES_SUCCESS, or the error from init_vdi_copy_number().
 */
static int init_objlist_and_vdi_bitmap(uint64_t oid, void *arg)
{
	objlist_cache_insert(oid);

	if (!is_vdi_obj(oid))
		return SD_RES_SUCCESS;

	vprintf(SDOG_DEBUG, "found the VDI object %" PRIx64 "\n", oid);
	set_bit(oid_to_vid(oid), sys->vdi_inuse);

	return init_vdi_copy_number(oid);
}
/*
 * Per-object initialization callback: insert 'oid' into the object list
 * cache and, if it is a VDI object, mark the VDI in sys->vdi_inuse.
 *
 * 'wd' and 'epoch' are used only for the debug message; 'ec_index', 'vinfo'
 * and 'arg' are unused.
 *
 * Always returns SD_RES_SUCCESS.
 */
static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd,
				       uint32_t epoch, uint8_t ec_index,
				       struct vnode_info *vinfo, void *arg)
{
	objlist_cache_insert(oid);

	if (!is_vdi_obj(oid))
		return SD_RES_SUCCESS;

	sd_debug("found the VDI object %" PRIx64" epoch %"PRIu32 " at %s",
		 oid, epoch, wd);
	atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);

	return SD_RES_SUCCESS;
}
int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = prepare_iocb(oid, iocb, true); int ret, fd; uint32_t len = iocb->length; bool ec = is_erasure_obj(oid, iocb->copy_policy); size_t obj_size; sd_debug("%"PRIx64, oid); get_obj_path(oid, path, sizeof(path)); get_tmp_obj_path(oid, tmp_path, sizeof(tmp_path)); if (uatomic_is_true(&sys->use_journal) && journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, true) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } if (ec) { uint8_t policy = iocb->copy_policy ?: get_vdi_copy_policy(oid_to_vid(oid)); int d; ec_policy_to_dp(policy, &d, NULL); obj_size = SD_DATA_OBJ_SIZE / d; } else
int remove_object(uint64_t oid) { struct sd_req hdr; int ret; sd_init_req(&hdr, SD_OP_REMOVE_OBJ); hdr.obj.oid = oid; hdr.obj.copies = get_vdi_copy_number(oid_to_vid(oid)); ret = exec_local_req(&hdr, NULL); if (ret != SD_RES_SUCCESS) sd_eprintf("failed to remove object %" PRIx64 ", %s", oid, sd_strerror(ret)); return ret; }
static void finish_requests(struct request *req, struct req_iter *reqs, int nr_to_send) { uint64_t oid = req->rq.obj.oid; uint32_t len = req->rq.data_length; uint64_t off = req->rq.obj.offset; int opcode = req->rq.opcode; int start = off / SD_EC_DATA_STRIPE_SIZE; int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j; int nr_stripe = end - start; if (!is_erasure_oid(oid)) goto out; sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32, start, end, nr_to_send, off, len); /* We need to assemble the data strips into the req buffer for read */ if (opcode == SD_OP_READ_OBJ) { char *p, *buf; uint8_t policy = req->rq.obj.copy_policy ?: get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid)); int ed = 0, strip_size; buf = malloc(SD_EC_DATA_STRIPE_SIZE * nr_stripe); if(unlikely(!buf)) { goto out; } ec_policy_to_dp(policy, &ed, NULL); strip_size = SD_EC_DATA_STRIPE_SIZE / ed; p = buf; for (i = 0; i < nr_stripe; i++) { for (j = 0; j < nr_to_send; j++) { memcpy(p, reqs[j].buf + strip_size * i, strip_size); p += strip_size; } } memcpy(req->data, buf + off % SD_EC_DATA_STRIPE_SIZE, len); req->rp.data_length = req->rq.data_length; free(buf); }
int read_backend_object(uint64_t oid, char *data, unsigned int datalen, uint64_t offset) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; sd_init_req(&hdr, SD_OP_READ_OBJ); hdr.data_length = datalen; hdr.obj.oid = oid; hdr.obj.offset = offset; hdr.obj.copies = get_vdi_copy_number(oid_to_vid(oid)); ret = exec_local_req(&hdr, data); if (ret != SD_RES_SUCCESS) sd_eprintf("failed to read object %" PRIx64 ", %s", oid, sd_strerror(ret)); untrim_zero_blocks(data, rsp->obj.offset, rsp->data_length, datalen); return ret; }
/* Write data to both local object cache (if enabled) and backends */ int write_object(uint64_t oid, char *data, unsigned int datalen, uint64_t offset, bool create) { struct sd_req hdr; int ret; if (sys->enable_object_cache && object_is_cached(oid)) { ret = object_cache_write(oid, data, datalen, offset, create); if (ret == SD_RES_NO_CACHE) goto forward_write; if (ret != 0) { sd_eprintf("write cache failed %"PRIx64" %"PRIx32, oid, ret); return ret; } } forward_write: if (create) sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); else sd_init_req(&hdr, SD_OP_WRITE_OBJ); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = datalen; hdr.obj.oid = oid; hdr.obj.offset = offset; hdr.obj.copies = get_vdi_copy_number(oid_to_vid(oid)); ret = exec_local_req(&hdr, data); if (ret != SD_RES_SUCCESS) sd_eprintf("failed to write object %" PRIx64 ", %s", oid, sd_strerror(ret)); return ret; }
/*
 * Tell whether 'oid' refers to an erasure-coded object.
 *
 * VDI objects, VDI btree objects and ledger objects never qualify.  Any
 * other object qualifies when the copy policy of its VDI is greater than 0.
 */
bool is_erasure_oid(uint64_t oid)
{
	if (is_vdi_obj(oid) || is_vdi_btree_obj(oid) || is_ledger_object(oid))
		return false;

	return get_vdi_copy_policy(oid_to_vid(oid)) > 0;
}
/* * We spread data strips of req along with its parity strips onto replica for * write operation. For read we only need to prepare data strip buffers. */ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr) { uint32_t len = req->rq.data_length; uint64_t off = req->rq.obj.offset; int opcode = req->rq.opcode; int start = off / SD_EC_DATA_STRIPE_SIZE; int end = DIV_ROUND_UP(off + len, SD_EC_DATA_STRIPE_SIZE), i, j; int nr_stripe = end - start; struct fec *ctx; int strip_size, nr_to_send; struct req_iter *reqs; char *p, *buf = NULL; uint8_t policy = req->rq.obj.copy_policy ?: get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid)); int ed = 0, ep = 0, edp; edp = ec_policy_to_dp(policy, &ed, &ep); ctx = ec_init(ed, edp); *nr = nr_to_send = (opcode == SD_OP_READ_OBJ) ? ed : edp; strip_size = SD_EC_DATA_STRIPE_SIZE / ed; reqs = xzalloc(sizeof(*reqs) * nr_to_send); sd_debug("start %d, end %d, send %d, off %"PRIu64 ", len %"PRIu32, start, end, nr_to_send, off, len); for (i = 0; i < nr_to_send; i++) { int l = strip_size * nr_stripe; reqs[i].buf = xmalloc(l); reqs[i].dlen = l; reqs[i].off = start * strip_size; switch (opcode) { case SD_OP_CREATE_AND_WRITE_OBJ: case SD_OP_WRITE_OBJ: reqs[i].wlen = l; break; default: break; } } if (opcode != SD_OP_WRITE_OBJ && opcode != SD_OP_CREATE_AND_WRITE_OBJ) goto out; /* Read and remove operation */ p = buf = init_erasure_buffer(req, SD_EC_DATA_STRIPE_SIZE * nr_stripe); if (!buf) { sd_err("failed to init erasure buffer %"PRIx64, req->rq.obj.oid); for (i = 0; i < nr_to_send; i++) free(reqs[i].buf); free(reqs); reqs = NULL; goto out; } for (i = 0; i < nr_stripe; i++) { const uint8_t *ds[ed]; uint8_t *ps[ep]; for (j = 0; j < ed; j++) ds[j] = reqs[j].buf + strip_size * i; for (j = 0; j < ep; j++) ps[j] = reqs[ed + j].buf + strip_size * i; for (j = 0; j < ed; j++) memcpy((uint8_t *)ds[j], p + j * strip_size, strip_size); ec_encode(ctx, ds, ps); p += SD_EC_DATA_STRIPE_SIZE; } out: ec_destroy(ctx); free(buf); return reqs; }
int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX], *dir; int flags = prepare_iocb(oid, iocb, true); int ret, fd; uint32_t len = iocb->length; uint32_t object_size = 0; size_t obj_size; uint64_t offset = iocb->offset; sd_debug("%016"PRIx64, oid); get_store_path(oid, iocb->ec_index, path); get_store_tmp_path(oid, iocb->ec_index, tmp_path); if (uatomic_is_true(&sys->use_journal) && journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, true) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_SYNC; sync(); } fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } obj_size = get_store_objsize(oid); trim_zero_blocks(iocb->buf, &offset, &len); object_size = get_vdi_object_size(oid_to_vid(oid)); if (offset != 0 || len != get_objsize(oid, object_size)) { if (is_sparse_object(oid)) ret = xftruncate(fd, obj_size); else ret = prealloc(fd, obj_size); if (ret < 0) { ret = err_to_sderr(path, oid, errno); goto out; } } ret = xpwrite(fd, iocb->buf, len, offset); if (ret != len) { sd_err("failed to write object. 
%m"); ret = err_to_sderr(path, oid, errno); goto out; } ret = rename(tmp_path, path); if (ret < 0) { sd_err("failed to rename %s to %s: %m", tmp_path, path); ret = err_to_sderr(path, oid, errno); goto out; } close(fd); if (uatomic_is_true(&sys->use_journal) || sys->nosync == true) { objlist_cache_insert(oid); return SD_RES_SUCCESS; } pstrcpy(tmp_path, sizeof(tmp_path), path); dir = dirname(tmp_path); fd = open(dir, O_DIRECTORY | O_RDONLY); if (fd < 0) { sd_err("failed to open directory %s: %m", dir); return err_to_sderr(path, oid, errno); } if (fsync(fd) != 0) { sd_err("failed to write directory %s: %m", dir); ret = err_to_sderr(path, oid, errno); close(fd); if (unlink(path) != 0) sd_err("failed to unlink %s: %m", path); return ret; } close(fd); objlist_cache_insert(oid); return SD_RES_SUCCESS; out: if (unlink(tmp_path) != 0) sd_err("failed to unlink %s: %m", tmp_path); close(fd); return ret; }