int default_create_and_write(uint64_t oid, struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = get_open_flags(oid, true); int ret, fd; uint32_t len = iocb->length; get_obj_path(oid, path); get_tmp_obj_path(oid, tmp_path); fd = open(tmp_path, flags, def_fmode); if (fd < 0) { if (errno == EEXIST) /* This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ dprintf("%s exists\n", tmp_path); return SD_RES_SUCCESS; eprintf("failed to open %s: %m\n", tmp_path); return err_to_sderr(oid, errno); } if (iocb->offset != 0 || iocb->length != get_objsize(oid)) { ret = prealloc(fd, get_objsize(oid)); if (ret != SD_RES_SUCCESS) goto out; } ret = xpwrite(fd, iocb->buf, len, iocb->offset); if (ret != len) { eprintf("failed to write object. %m\n"); ret = err_to_sderr(oid, errno); goto out; } ret = rename(tmp_path, path); if (ret < 0) { eprintf("failed to rename %s to %s: %m\n", tmp_path, path); ret = err_to_sderr(oid, errno); goto out; } dprintf("%"PRIx64"\n", oid); ret = SD_RES_SUCCESS; out: if (ret != SD_RES_SUCCESS) unlink(tmp_path); close(fd); return ret; }
/* Trim zero blocks of the beginning and end of the object. */ static int default_trim(int fd, uint64_t oid, const struct siocb *iocb, uint64_t *poffset, uint32_t *plen) { trim_zero_blocks(iocb->buf, poffset, plen); if (iocb->offset < *poffset) { sd_debug("discard between %d, %ld, %" PRIx64, iocb->offset, *poffset, oid); if (discard(fd, iocb->offset, *poffset) < 0) return -1; } if (*poffset + *plen < iocb->offset + iocb->length) { uint64_t end = iocb->offset + iocb->length; if (end == get_objsize(oid)) /* This is necessary to punch the last block */ end = round_up(end, BLOCK_SIZE); sd_debug("discard between %ld, %ld, %" PRIx64, *poffset + *plen, end, oid); if (discard(fd, *poffset + *plen, end) < 0) return -1; } return 0; }
/*
 * Return the size used to store the object `oid`.
 *
 * Objects for which is_erasure_oid() is false use get_objsize().  For the
 * others, the copy policy of the owning VDI is looked up and
 * SD_DATA_OBJ_SIZE is divided by the first value that ec_policy_to_dp()
 * reports for that policy.
 */
size_t get_store_objsize(uint64_t oid)
{
	uint8_t vdi_policy;
	int divisor;

	if (!is_erasure_oid(oid))
		return get_objsize(oid);

	vdi_policy = get_vdi_copy_policy(oid_to_vid(oid));
	ec_policy_to_dp(vdi_policy, &divisor, NULL);

	return SD_DATA_OBJ_SIZE / divisor;
}
static int farm_write(uint64_t oid, struct siocb *iocb, int create) { int flags = def_open_flags, fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } if (!is_data_obj(oid)) flags &= ~O_DIRECT; if (create) flags |= O_CREAT | O_TRUNC; sprintf(path, "%s%016"PRIx64, obj_path, oid); fd = open(path, flags, def_fmode); if (fd < 0) return err_to_sderr(oid, errno); if (flock(fd, LOCK_EX) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } if (create && !(iocb->flags & SD_FLAG_CMD_COW)) { ret = prealloc(fd, get_objsize(oid)); if (ret != SD_RES_SUCCESS) { if (flock(fd, LOCK_UN) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } goto out; } } size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (flock(fd, LOCK_UN) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } if (size != iocb->length) { eprintf("%m\n"); ret = SD_RES_EIO; goto out; } trunk_update_entry(oid); out: close(fd); return ret; }
/*
 * Return the size used to store the object `oid`.
 *
 * - VDI objects (is_vdi_obj()) always use SD_INODE_SIZE.
 * - With a non-zero copy_policy, SD_DATA_OBJ_SIZE is divided by the first
 *   value that ec_policy_to_dp() reports for that policy.
 * - Otherwise the result is get_objsize(oid).
 */
size_t get_store_objsize(uint8_t copy_policy, uint64_t oid)
{
	int divisor;

	if (is_vdi_obj(oid))
		return SD_INODE_SIZE;

	if (copy_policy == 0)
		return get_objsize(oid);

	ec_policy_to_dp(copy_policy, &divisor, NULL);

	return SD_DATA_OBJ_SIZE / divisor;
}
static int recover_object_from_replica(uint64_t oid, const struct sd_vnode *vnode, uint32_t epoch, uint32_t tgt_epoch) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; unsigned rlen; int ret = SD_RES_NO_MEM; void *buf = NULL; struct siocb iocb = { 0 }; if (vnode_is_local(vnode)) { ret = sd_store->link(oid, tgt_epoch); goto out; } rlen = get_objsize(oid); buf = valloc(rlen); if (!buf) { sd_eprintf("%m"); goto out; } sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; hdr.data_length = rlen; hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; ret = sheep_exec_req(&vnode->nid, &hdr, buf); if (ret != SD_RES_SUCCESS) goto out; iocb.epoch = epoch; iocb.length = rsp->data_length; iocb.offset = rsp->obj.offset; iocb.buf = buf; ret = sd_store->create_and_write(oid, &iocb); out: if (ret == SD_RES_SUCCESS) { sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid, tgt_epoch, epoch); objlist_cache_insert(oid); } free(buf); return ret; }
int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1) { int ret; void *buf; struct siocb iocb = {}; uint32_t length; bool is_readonly_obj = oid_is_readonly(oid); char path[PATH_MAX]; ret = get_object_path(oid, epoch, path); if (ret != SD_RES_SUCCESS) return ret; if (is_readonly_obj) { if (get_object_sha1(path, sha1) == 0) { sd_debug("use cached sha1 digest %s", sha1_to_hex(sha1)); return SD_RES_SUCCESS; } } length = get_objsize(oid); buf = valloc(length); if (buf == NULL) return SD_RES_NO_MEM; iocb.epoch = epoch; iocb.buf = buf; iocb.length = length; ret = default_read_from_path(oid, path, &iocb); if (ret != SD_RES_SUCCESS) { free(buf); return ret; } sha1_from_buffer(buf, length, sha1); free(buf); sd_debug("the message digest of %"PRIx64" at epoch %d is %s", oid, epoch, sha1_to_hex(sha1)); if (is_readonly_obj) set_object_sha1(path, sha1); return ret; }
/*
 * Return the size used to store the object `oid`.
 *
 * - VDI objects (is_vdi_obj()) use SD_INODE_SIZE.
 * - VDI btree objects (is_vdi_btree_obj()) use SD_INODE_DATA_INDEX_SIZE.
 * - Otherwise the object size is 1 << block_size_shift.  With a non-zero
 *   copy_policy it is divided by the first value that ec_policy_to_dp()
 *   reports; with a zero copy_policy it is passed to get_objsize().
 */
size_t get_store_objsize(uint8_t copy_policy, uint8_t block_size_shift,
			 uint64_t oid)
{
	uint32_t object_size;
	int divisor;

	if (is_vdi_obj(oid))
		return SD_INODE_SIZE;

	if (is_vdi_btree_obj(oid))
		return SD_INODE_DATA_INDEX_SIZE;

	object_size = UINT32_C(1) << block_size_shift;

	if (copy_policy == 0)
		return get_objsize(oid, object_size);

	ec_policy_to_dp(copy_policy, &divisor, NULL);

	return object_size / divisor;
}
int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = prepare_iocb(oid, iocb, true); int ret, fd; uint32_t len = iocb->length; size_t obj_size; uint64_t offset = iocb->offset; sd_debug("%"PRIx64, oid); get_store_path(oid, iocb->ec_index, path); get_store_tmp_path(oid, iocb->ec_index, tmp_path); if (uatomic_is_true(&sys->use_journal) && journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, true) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } obj_size = get_store_objsize(oid); trim_zero_blocks(iocb->buf, &offset, &len); if (offset != 0 || len != get_objsize(oid)) { if (is_sparse_object(oid)) ret = xftruncate(fd, obj_size); else ret = prealloc(fd, obj_size); if (ret < 0) { ret = err_to_sderr(path, oid, errno); goto out; } } ret = xpwrite(fd, iocb->buf, len, offset); if (ret != len) { sd_err("failed to write object. %m"); ret = err_to_sderr(path, oid, errno); goto out; } ret = rename(tmp_path, path); if (ret < 0) { sd_err("failed to rename %s to %s: %m", tmp_path, path); ret = err_to_sderr(path, oid, errno); goto out; } ret = SD_RES_SUCCESS; objlist_cache_insert(oid); out: if (ret != SD_RES_SUCCESS) unlink(tmp_path); close(fd); return ret; }
static int recover_object_from_replica(uint64_t oid, struct sd_vnode *entry, uint32_t epoch, uint32_t tgt_epoch) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char name[128]; unsigned wlen = 0, rlen; int fd, ret = -1; void *buf; struct siocb iocb = { 0 }; rlen = get_objsize(oid); buf = valloc(rlen); if (!buf) { eprintf("%m\n"); goto out; } if (vnode_is_local(entry)) { iocb.epoch = epoch; iocb.length = rlen; ret = sd_store->link(oid, &iocb, tgt_epoch); if (ret == SD_RES_SUCCESS) { ret = 0; goto done; } else { ret = -1; goto out; } } addr_to_str(name, sizeof(name), entry->nid.addr, 0); fd = connect_to(name, entry->nid.port); dprintf("%s, %d\n", name, entry->nid.port); if (fd < 0) { eprintf("failed to connect to %s:%"PRIu32"\n", name, entry->nid.port); ret = -1; goto out; } sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; hdr.data_length = rlen; hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; ret = exec_req(fd, &hdr, buf, &wlen, &rlen); close(fd); if (ret != 0) { eprintf("res: %"PRIx32"\n", rsp->result); ret = -1; goto out; } rsp = (struct sd_rsp *)&hdr; if (rsp->result == SD_RES_SUCCESS) { iocb.epoch = epoch; iocb.length = rlen; iocb.buf = buf; ret = sd_store->atomic_put(oid, &iocb); if (ret != SD_RES_SUCCESS) { ret = -1; goto out; } } else { eprintf("failed, res: %"PRIx32"\n", rsp->result); ret = rsp->result; goto out; } done: dprintf("recovered oid %"PRIx64" from %d to epoch %d\n", oid, tgt_epoch, epoch); out: if (ret == SD_RES_SUCCESS) objlist_cache_insert(oid); free(buf); return ret; }
int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = get_open_flags(oid, true, iocb->flags); int ret, fd; uint32_t len = iocb->length; get_obj_path(oid, path); get_tmp_obj_path(oid, tmp_path); if (uatomic_is_true(&sys->use_journal) && journal_file_write(oid, iocb->buf, iocb->length, iocb->offset, true) != SD_RES_SUCCESS) { sd_eprintf("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(tmp_path, flags, def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_dprintf("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_eprintf("failed to open %s: %m", tmp_path); return err_to_sderr(oid, errno); } if (iocb->offset != 0 || iocb->length != get_objsize(oid)) { ret = prealloc(fd, get_objsize(oid)); if (ret < 0) { ret = err_to_sderr(oid, errno); goto out; } } ret = xpwrite(fd, iocb->buf, len, iocb->offset); if (ret != len) { sd_eprintf("failed to write object. %m"); ret = err_to_sderr(oid, errno); goto out; } ret = rename(tmp_path, path); if (ret < 0) { sd_eprintf("failed to rename %s to %s: %m", tmp_path, path); ret = err_to_sderr(oid, errno); goto out; } sd_dprintf("%"PRIx64, oid); ret = SD_RES_SUCCESS; out: if (ret != SD_RES_SUCCESS) unlink(tmp_path); close(fd); return ret; }
sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } if (ec) { uint8_t policy = iocb->copy_policy ?: get_vdi_copy_policy(oid_to_vid(oid)); int d; ec_policy_to_dp(policy, &d, NULL); obj_size = SD_DATA_OBJ_SIZE / d; } else obj_size = get_objsize(oid); ret = prealloc(fd, obj_size); if (ret < 0) { ret = err_to_sderr(path, oid, errno); goto out; } ret = xpwrite(fd, iocb->buf, len, iocb->offset); if (ret != len) { sd_err("failed to write object. %m"); ret = err_to_sderr(path, oid, errno); goto out; } ret = rename(tmp_path, path);