/*
 * Walk the directory named by obj_path and register every object file
 * found there.
 *
 * Each entry name is parsed as a hexadecimal object id.  Entries whose
 * name starts with '.' are skipped, as are names that parse to 0 (which
 * is also what strtoull() yields when no digits are converted) or to
 * ULLONG_MAX (strtoull()'s overflow value).  Every remaining oid is
 * handed to objlist_cache_insert() and lookup_trunk_entry(oid, 1).
 *
 * After the scan omap_tree is reset to RB_ROOT.
 *
 * Returns 0 on success, -1 if the directory cannot be opened.
 */
int trunk_init(void)
{
	struct dirent *ent;
	DIR *objdir = opendir(obj_path);

	if (objdir == NULL)
		return -1;

	for (ent = readdir(objdir); ent != NULL; ent = readdir(objdir)) {
		uint64_t oid;

		/* ".", ".." and any other dot-prefixed name */
		if (ent->d_name[0] == '.')
			continue;

		oid = strtoull(ent->d_name, NULL, 16);
		if (oid == 0 || oid == ULLONG_MAX)
			continue;

		objlist_cache_insert(oid);
		/* NOTE(review): the flag 1 presumably means "create" - confirm */
		lookup_trunk_entry(oid, 1);
	}

	omap_tree = RB_ROOT;
	closedir(objdir);

	return 0;
}
/*
 * Per-object callback: record @oid in the object list cache and, when it
 * is a VDI object, initialise its VDI state from working directory @wd at
 * @epoch.
 *
 * @arg is unused.
 *
 * Returns SD_RES_SUCCESS for non-VDI objects, otherwise whatever
 * init_vdi_state() returns.
 */
static int init_objlist_and_vdi_bitmap(uint64_t oid, char *wd, uint32_t epoch,
				       void *arg)
{
	objlist_cache_insert(oid);

	/* Nothing more to do for plain data objects. */
	if (!is_vdi_obj(oid))
		return SD_RES_SUCCESS;

	sd_debug("found the VDI object %" PRIx64, oid);

	return init_vdi_state(oid, wd, epoch);
}
/*
 * Per-object callback: record @oid in the object list cache and, when it
 * is a VDI object, mark its VDI id in sys->vdi_inuse and initialise its
 * copy number.
 *
 * @arg is unused.
 *
 * Returns SD_RES_SUCCESS for non-VDI objects, otherwise whatever
 * init_vdi_copy_number() returns.
 */
static int init_objlist_and_vdi_bitmap(uint64_t oid, void *arg)
{
	objlist_cache_insert(oid);

	/* Only VDI objects touch the in-use bitmap. */
	if (!is_vdi_obj(oid))
		return SD_RES_SUCCESS;

	vprintf(SDOG_DEBUG, "found the VDI object %" PRIx64 "\n", oid);
	set_bit(oid_to_vid(oid), sys->vdi_inuse);

	return init_vdi_copy_number(oid);
}
/*
 * Per-object callback: record @oid in the object list cache and, when it
 * is a VDI object, atomically set its VDI id in sys->vdi_inuse.
 *
 * @wd and @epoch are only used for the debug message; @ec_index, @vinfo
 * and @arg are unused.
 *
 * Always returns SD_RES_SUCCESS.
 */
static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd,
				       uint32_t epoch, uint8_t ec_index,
				       struct vnode_info *vinfo, void *arg)
{
	objlist_cache_insert(oid);

	if (!is_vdi_obj(oid))
		return SD_RES_SUCCESS;

	sd_debug("found the VDI object %" PRIx64" epoch %"PRIu32 " at %s",
		 oid, epoch, wd);
	atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);

	return SD_RES_SUCCESS;
}
static int recover_object_from_replica(uint64_t oid, const struct sd_vnode *vnode, uint32_t epoch, uint32_t tgt_epoch) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; unsigned rlen; int ret = SD_RES_NO_MEM; void *buf = NULL; struct siocb iocb = { 0 }; if (vnode_is_local(vnode)) { ret = sd_store->link(oid, tgt_epoch); goto out; } rlen = get_objsize(oid); buf = valloc(rlen); if (!buf) { sd_eprintf("%m"); goto out; } sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; hdr.data_length = rlen; hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; ret = sheep_exec_req(&vnode->nid, &hdr, buf); if (ret != SD_RES_SUCCESS) goto out; iocb.epoch = epoch; iocb.length = rsp->data_length; iocb.offset = rsp->obj.offset; iocb.buf = buf; ret = sd_store->create_and_write(oid, &iocb); out: if (ret == SD_RES_SUCCESS) { sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid, tgt_epoch, epoch); objlist_cache_insert(oid); } free(buf); return ret; }
/*
 * Per-object callback: record @oid in the object list cache and, when it
 * is a VDI object, initialise its VDI state from working directory @wd at
 * @epoch.
 *
 * @ec_index, @vinfo and @arg are unused.
 *
 * Returns SD_RES_SUCCESS for non-VDI objects, otherwise whatever
 * init_vdi_state() returns.
 */
static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd,
				       uint32_t epoch, uint8_t ec_index,
				       struct vnode_info *vinfo, void *arg)
{
	int status = SD_RES_SUCCESS;

	objlist_cache_insert(oid);

	if (is_vdi_obj(oid)) {
		sd_debug("found the VDI object %" PRIx64" epoch %"PRIu32 " at %s",
			 oid, epoch, wd);
		status = init_vdi_state(oid, wd, epoch);
	}

	return status;
}
int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = prepare_iocb(oid, iocb, true); int ret, fd; uint32_t len = iocb->length; size_t obj_size; uint64_t offset = iocb->offset; sd_debug("%"PRIx64, oid); get_store_path(oid, iocb->ec_index, path); get_store_tmp_path(oid, iocb->ec_index, tmp_path); if (uatomic_is_true(&sys->use_journal) && journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, true) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } obj_size = get_store_objsize(oid); trim_zero_blocks(iocb->buf, &offset, &len); if (offset != 0 || len != get_objsize(oid)) { if (is_sparse_object(oid)) ret = xftruncate(fd, obj_size); else ret = prealloc(fd, obj_size); if (ret < 0) { ret = err_to_sderr(path, oid, errno); goto out; } } ret = xpwrite(fd, iocb->buf, len, offset); if (ret != len) { sd_err("failed to write object. %m"); ret = err_to_sderr(path, oid, errno); goto out; } ret = rename(tmp_path, path); if (ret < 0) { sd_err("failed to rename %s to %s: %m", tmp_path, path); ret = err_to_sderr(path, oid, errno); goto out; } ret = SD_RES_SUCCESS; objlist_cache_insert(oid); out: if (ret != SD_RES_SUCCESS) unlink(tmp_path); close(fd); return ret; }
static int recover_object_from_replica(uint64_t oid, struct sd_vnode *entry, uint32_t epoch, uint32_t tgt_epoch) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char name[128]; unsigned wlen = 0, rlen; int fd, ret = -1; void *buf; struct siocb iocb = { 0 }; rlen = get_objsize(oid); buf = valloc(rlen); if (!buf) { eprintf("%m\n"); goto out; } if (vnode_is_local(entry)) { iocb.epoch = epoch; iocb.length = rlen; ret = sd_store->link(oid, &iocb, tgt_epoch); if (ret == SD_RES_SUCCESS) { ret = 0; goto done; } else { ret = -1; goto out; } } addr_to_str(name, sizeof(name), entry->nid.addr, 0); fd = connect_to(name, entry->nid.port); dprintf("%s, %d\n", name, entry->nid.port); if (fd < 0) { eprintf("failed to connect to %s:%"PRIu32"\n", name, entry->nid.port); ret = -1; goto out; } sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; hdr.data_length = rlen; hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; ret = exec_req(fd, &hdr, buf, &wlen, &rlen); close(fd); if (ret != 0) { eprintf("res: %"PRIx32"\n", rsp->result); ret = -1; goto out; } rsp = (struct sd_rsp *)&hdr; if (rsp->result == SD_RES_SUCCESS) { iocb.epoch = epoch; iocb.length = rlen; iocb.buf = buf; ret = sd_store->atomic_put(oid, &iocb); if (ret != SD_RES_SUCCESS) { ret = -1; goto out; } } else { eprintf("failed, res: %"PRIx32"\n", rsp->result); ret = rsp->result; goto out; } done: dprintf("recovered oid %"PRIx64" from %d to epoch %d\n", oid, tgt_epoch, epoch); out: if (ret == SD_RES_SUCCESS) objlist_cache_insert(oid); free(buf); return ret; }
int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = prepare_iocb(oid, iocb, true); int ret, fd; uint32_t len = iocb->length; size_t obj_size; sd_debug("%"PRIx64, oid); get_store_path(oid, iocb->ec_index, path); get_store_tmp_path(oid, iocb->ec_index, tmp_path); fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } obj_size = get_store_objsize(oid); ret = prealloc(fd, obj_size); if (ret < 0) { ret = err_to_sderr(path, oid, errno); goto out; } ret = xpwrite(fd, iocb->buf, len, iocb->offset); if (ret != len) { sd_err("failed to write object. %m"); ret = err_to_sderr(path, oid, errno); goto out; } /* * Modern FS like ext4, xfs defaults to automatic syncing of files after * replace-via-rename and replace-via-truncate operations. So rename * without fsync() is actually safe. */ ret = rename(tmp_path, path); if (ret < 0) { sd_err("failed to rename %s to %s: %m", tmp_path, path); ret = err_to_sderr(path, oid, errno); goto out; } ret = SD_RES_SUCCESS; objlist_cache_insert(oid); out: if (ret != SD_RES_SUCCESS && unlink(tmp_path) != 0) sd_err("failed to unlink %s: %m", tmp_path); close(fd); return ret; }