/*
 * Read iocb->length bytes at iocb->offset from the file at @path into
 * iocb->buf.
 *
 * Returns SD_RES_SUCCESS when the full length was read, otherwise the
 * code produced by err_to_sderr() for the step that failed.
 */
static int default_read_from_path(uint64_t oid, const char *path,
				  const struct siocb *iocb)
{
	int open_flags = prepare_iocb(oid, iocb, false);
	int result = SD_RES_SUCCESS;
	ssize_t nread;
	int fd;

	/*
	 * The object may sit in the wrong place after a 'shutdown/restart
	 * with less disks' or because of a bug, so verify its location first.
	 * err_to_sderr() is used so that a broken disk is reported as EIO.
	 *
	 * Stale paths are exempt: get_store_stale_path already does the
	 * default_exist job for them.
	 */
	if (!(is_stale_path(path) || default_exist(oid, iocb->ec_index)))
		return err_to_sderr(path, oid, ENOENT);

	fd = open(path, open_flags);
	if (fd < 0)
		return err_to_sderr(path, oid, errno);

	nread = xpread(fd, iocb->buf, iocb->length, iocb->offset);
	if (unlikely(nread != iocb->length)) {
		sd_err("failed to read object %"PRIx64", path=%s, offset=%"
		       PRId32", size=%"PRId32", result=%zd, %m", oid, path,
		       iocb->offset, iocb->length, nread);
		result = err_to_sderr(path, oid, errno);
	}

	close(fd);
	return result;
}
int default_write(uint64_t oid, struct siocb *iocb) { int flags = get_open_flags(oid, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } get_obj_path(oid, path); if (iocb->flags & SD_FLAG_CMD_CACHE && is_disk_cache_enabled()) flags &= ~O_DSYNC; fd = open(path, flags, def_fmode); if (fd < 0) return err_to_sderr(oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (size != iocb->length) { eprintf("failed to write object %"PRIx64", path=%s, offset=%" PRId64", size=%"PRId32", result=%zd, %m\n", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(oid, errno); goto out; } out: close(fd); return ret; }
int default_write(uint64_t oid, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; uint32_t len = iocb->length; uint64_t offset = iocb->offset; static bool trim_is_supported = true; if (iocb->epoch < sys_epoch()) { sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } if (uatomic_is_true(&sys->use_journal) && unlikely(journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, false)) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } get_store_path(oid, iocb->ec_index, path); /* * Make sure oid is in the right place because oid might be misplaced * in a wrong place, due to 'shutdown/restart with less/more disks' or * any bugs. We need call err_to_sderr() to return EIO if disk is broken */ if (!default_exist(oid, iocb->ec_index)) return err_to_sderr(path, oid, ENOENT); fd = open(path, flags, sd_def_fmode); if (unlikely(fd < 0)) return err_to_sderr(path, oid, errno); if (trim_is_supported && is_sparse_object(oid)) { if (default_trim(fd, oid, iocb, &offset, &len) < 0) { trim_is_supported = false; offset = iocb->offset; len = iocb->length; } } size = xpwrite(fd, iocb->buf, len, offset); if (unlikely(size != len)) { sd_err("failed to write object %"PRIx64", path=%s, offset=%" PRId32", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); goto out; } out: close(fd); return ret; }
int default_create_and_write(uint64_t oid, struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = get_open_flags(oid, true); int ret, fd; uint32_t len = iocb->length; get_obj_path(oid, path); get_tmp_obj_path(oid, tmp_path); fd = open(tmp_path, flags, def_fmode); if (fd < 0) { if (errno == EEXIST) /* This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ dprintf("%s exists\n", tmp_path); return SD_RES_SUCCESS; eprintf("failed to open %s: %m\n", tmp_path); return err_to_sderr(oid, errno); } if (iocb->offset != 0 || iocb->length != get_objsize(oid)) { ret = prealloc(fd, get_objsize(oid)); if (ret != SD_RES_SUCCESS) goto out; } ret = xpwrite(fd, iocb->buf, len, iocb->offset); if (ret != len) { eprintf("failed to write object. %m\n"); ret = err_to_sderr(oid, errno); goto out; } ret = rename(tmp_path, path); if (ret < 0) { eprintf("failed to rename %s to %s: %m\n", tmp_path, path); ret = err_to_sderr(oid, errno); goto out; } dprintf("%"PRIx64"\n", oid); ret = SD_RES_SUCCESS; out: if (ret != SD_RES_SUCCESS) unlink(tmp_path); close(fd); return ret; }
static int farm_open(uint64_t oid, struct siocb *iocb, int create) { struct strbuf buf = STRBUF_INIT; int ret = SD_RES_SUCCESS, fd; int flags = def_open_flags; if (iocb->epoch < sys->epoch) goto out; if (is_vdi_obj(oid)) flags &= ~O_DIRECT; if (create) flags |= O_CREAT | O_TRUNC; strbuf_addstr(&buf, obj_path); strbuf_addf(&buf, "%016" PRIx64, oid); fd = open(buf.buf, flags, def_fmode); if (fd < 0) { ret = err_to_sderr(oid, errno); goto out; } iocb->fd = fd; ret = SD_RES_SUCCESS; if (!(iocb->flags & SD_FLAG_CMD_COW) && create) { ret = prealloc(fd, iocb->length); if (ret != SD_RES_SUCCESS) close(fd); } out: strbuf_release(&buf); return ret; }
static int farm_write(uint64_t oid, struct siocb *iocb, int create) { int flags = def_open_flags, fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } if (!is_data_obj(oid)) flags &= ~O_DIRECT; if (create) flags |= O_CREAT | O_TRUNC; sprintf(path, "%s%016"PRIx64, obj_path, oid); fd = open(path, flags, def_fmode); if (fd < 0) return err_to_sderr(oid, errno); if (flock(fd, LOCK_EX) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } if (create && !(iocb->flags & SD_FLAG_CMD_COW)) { ret = prealloc(fd, get_objsize(oid)); if (ret != SD_RES_SUCCESS) { if (flock(fd, LOCK_UN) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } goto out; } } size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (flock(fd, LOCK_UN) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } if (size != iocb->length) { eprintf("%m\n"); ret = SD_RES_EIO; goto out; } trunk_update_entry(oid); out: close(fd); return ret; }
int default_write(uint64_t oid, const struct siocb *iocb) { int flags = get_open_flags(oid, false, iocb->flags), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { sd_dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } get_obj_path(oid, path); if (uatomic_is_true(&sys->use_journal) && journal_file_write(oid, iocb->buf, iocb->length, iocb->offset, false) != SD_RES_SUCCESS) { sd_eprintf("turn off journaling\n"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(path, flags, def_fmode); if (fd < 0) return err_to_sderr(oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (size != iocb->length) { sd_eprintf("failed to write object %"PRIx64", path=%s, offset=%" PRId64", size=%"PRId32", result=%zd, %m\n", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(oid, errno); goto out; } out: close(fd); return ret; }
/*
 * Preallocate the whole object to get a better filesystem layout.
 *
 * Tries fallocate() first. When it fails with ENOSYS or EOPNOTSUPP the
 * file is extended through write_last_sector() instead; any other failure
 * is converted with err_to_sderr().
 *
 * Returns SD_RES_SUCCESS on success, otherwise the error code of the
 * path that failed.
 */
int prealloc(int fd, uint32_t size)
{
	if (fallocate(fd, 0, 0, size) >= 0)
		return SD_RES_SUCCESS;

	if (errno == ENOSYS || errno == EOPNOTSUPP)
		return write_last_sector(fd, size);

	return err_to_sderr(0, errno); /* FIXME: set oid */
}
int default_write(uint64_t oid, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } if (uatomic_is_true(&sys->use_journal) && unlikely(journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, false)) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } get_obj_path(oid, path); fd = open(path, flags, sd_def_fmode); if (unlikely(fd < 0)) return err_to_sderr(path, oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (unlikely(size != iocb->length)) { sd_err("failed to write object %"PRIx64", path=%s, offset=%" PRId64", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); goto out; } out: close(fd); return ret; }
/*
 * Read iocb->length bytes at iocb->offset from the file at @path into
 * iocb->buf.
 *
 * Returns SD_RES_SUCCESS when the full length was read, otherwise the
 * code produced by err_to_sderr() for the open or read failure.
 */
static int default_read_from_path(uint64_t oid, char *path,
				  const struct siocb *iocb)
{
	int open_flags = prepare_iocb(oid, iocb, false);
	int result = SD_RES_SUCCESS;
	ssize_t nread;
	int fd;

	fd = open(path, open_flags);
	if (fd < 0)
		return err_to_sderr(path, oid, errno);

	nread = xpread(fd, iocb->buf, iocb->length, iocb->offset);
	if (unlikely(nread != iocb->length)) {
		sd_err("failed to read object %"PRIx64", path=%s, offset=%"
		       PRId64", size=%"PRId32", result=%zd, %m", oid, path,
		       iocb->offset, iocb->length, nread);
		result = err_to_sderr(path, oid, errno);
	}

	close(fd);
	return result;
}
int default_write(uint64_t oid, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } get_store_path(oid, iocb->ec_index, path); /* * Make sure oid is in the right place because oid might be misplaced * in a wrong place, due to 'shutdown/restart with less/more disks' or * any bugs. We need call err_to_sderr() to return EIO if disk is broken */ if (!default_exist(oid, iocb->ec_index)) return err_to_sderr(path, oid, errno); fd = open(path, flags, sd_def_fmode); if (unlikely(fd < 0)) return err_to_sderr(path, oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (unlikely(size != iocb->length)) { sd_err("failed to write object %"PRIx64", path=%s, offset=%" PRId32", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); goto out; } out: close(fd); return ret; }
/*
 * Read iocb->length bytes at iocb->offset from the file at @path into
 * iocb->buf.
 *
 * Returns SD_RES_SUCCESS when the full length was read, otherwise the
 * code produced by err_to_sderr() for the open or read failure.
 */
static int default_read_from_path(uint64_t oid, char *path,
				  struct siocb *iocb)
{
	int open_flags = get_open_flags(oid, false);
	int result = SD_RES_SUCCESS;
	ssize_t nread;
	int fd;

	fd = open(path, open_flags);
	if (fd < 0)
		return err_to_sderr(oid, errno);

	nread = xpread(fd, iocb->buf, iocb->length, iocb->offset);
	if (nread != iocb->length) {
		eprintf("failed to read object %"PRIx64", path=%s, offset=%"
			PRId64", size=%"PRId32", result=%zd, %m\n", oid, path,
			iocb->offset, iocb->length, nread);
		result = err_to_sderr(oid, errno);
	}

	close(fd);
	return result;
}
static int default_read_from_path(uint64_t oid, const char *path, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; ssize_t size; fd = open(path, flags); if (fd < 0) return err_to_sderr(path, oid, errno); if (is_erasure_oid(oid) && iocb->ec_index <= SD_MAX_COPIES) { uint8_t idx; if (get_erasure_index(path, &idx) < 0) { close(fd); return err_to_sderr(path, oid, errno); } /* We pretend NO-OBJ to read old object in the stale dir */ if (idx != iocb->ec_index) { sd_debug("ec_index %d != %d", iocb->ec_index, idx); close(fd); return SD_RES_NO_OBJ; } } size = xpread(fd, iocb->buf, iocb->length, iocb->offset); if (unlikely(size != iocb->length)) { sd_err("failed to read object %"PRIx64", path=%s, offset=%" PRId32", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); } close(fd); return ret; }
/*
 * Unlink the store file of @oid / @ec_index.
 *
 * Returns SD_RES_SUCCESS when the file was removed, SD_RES_NO_OBJ when
 * unlink() reports ENOENT, and the err_to_sderr() code for any other
 * failure.
 */
int default_remove_object(uint64_t oid, uint8_t ec_index)
{
	char path[PATH_MAX];

	get_store_path(oid, ec_index, path);

	if (unlink(path) >= 0)
		return SD_RES_SUCCESS;

	return errno == ENOENT ? SD_RES_NO_OBJ
			       : err_to_sderr(path, oid, errno);
}
int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = prepare_iocb(oid, iocb, true); int ret, fd; uint32_t len = iocb->length; bool ec = is_erasure_obj(oid, iocb->copy_policy); size_t obj_size; sd_debug("%"PRIx64, oid); get_obj_path(oid, path, sizeof(path)); get_tmp_obj_path(oid, tmp_path, sizeof(tmp_path)); if (uatomic_is_true(&sys->use_journal) && journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, true) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } if (ec) { uint8_t policy = iocb->copy_policy ?: get_vdi_copy_policy(oid_to_vid(oid)); int d; ec_policy_to_dp(policy, &d, NULL); obj_size = SD_DATA_OBJ_SIZE / d; } else
/*
 * Hard-link the stale copy of @oid from epoch @tgt_epoch to the object's
 * regular path. @iocb is not used.
 *
 * Returns SD_RES_SUCCESS on success, otherwise the code produced by
 * err_to_sderr() for the link() failure.
 */
int default_link(uint64_t oid, struct siocb *iocb, uint32_t tgt_epoch)
{
	char stale_path[PATH_MAX];
	char path[PATH_MAX];

	dprintf("try link %"PRIx64" from snapshot with epoch %d\n", oid,
		tgt_epoch);

	get_obj_path(oid, path);
	get_stale_obj_path(oid, tgt_epoch, stale_path);

	if (link(stale_path, path) >= 0)
		return SD_RES_SUCCESS;

	eprintf("failed to link from %s to %s, %m\n", stale_path, path);
	return err_to_sderr(oid, errno);
}
/*
 * Write one zero-filled sector at offset (@length - SECTOR_SIZE) of @fd,
 * so that the file reaches @length bytes.
 *
 * Returns SD_RES_SUCCESS on a complete write, SD_RES_NO_MEM when the
 * sector buffer cannot be allocated, and the err_to_sderr() code when the
 * write falls short.
 *
 * NOTE(review): @length below SECTOR_SIZE makes the offset arithmetic
 * wrap around - presumably callers never pass such a size; confirm.
 */
static int write_last_sector(int fd, uint32_t length)
{
	const int size = SECTOR_SIZE;
	off_t last_sector = length - size;
	char *zeros;
	int status;

	zeros = valloc(size);
	if (!zeros) {
		eprintf("failed to allocate memory\n");
		return SD_RES_NO_MEM;
	}
	memset(zeros, 0, size);

	status = xpwrite(fd, zeros, size, last_sector);
	status = (status == size) ? SD_RES_SUCCESS
				  : err_to_sderr(0, errno); /* FIXME: set oid */

	free(zeros);
	return status;
}
/*
 * Hard-link the stale copy of @oid from epoch @tgt_epoch into the
 * object's directory.
 *
 * Returns SD_RES_SUCCESS on success or when the target already exists,
 * otherwise the code produced by err_to_sderr() for the link() failure.
 */
int default_link(uint64_t oid, uint32_t tgt_epoch)
{
	char stale_path[PATH_MAX];
	char path[PATH_MAX];

	sd_debug("try link %"PRIx64" from snapshot with epoch %d", oid,
		 tgt_epoch);

	snprintf(path, PATH_MAX, "%s/%016"PRIx64, md_get_object_dir(oid), oid);
	get_store_stale_path(oid, tgt_epoch, 0, stale_path);

	/*
	 * Recovery thread and main thread might try to recover the same
	 * object, so EEXIST from link() is treated as success.
	 */
	if (link(stale_path, path) < 0 && errno != EEXIST) {
		sd_debug("failed to link from %s to %s, %m", stale_path, path);
		return err_to_sderr(path, oid, errno);
	}

	return SD_RES_SUCCESS;
}
int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = prepare_iocb(oid, iocb, true); int ret, fd; uint32_t len = iocb->length; size_t obj_size; uint64_t offset = iocb->offset; sd_debug("%"PRIx64, oid); get_store_path(oid, iocb->ec_index, path); get_store_tmp_path(oid, iocb->ec_index, tmp_path); if (uatomic_is_true(&sys->use_journal) && journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, true) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } obj_size = get_store_objsize(oid); trim_zero_blocks(iocb->buf, &offset, &len); if (offset != 0 || len != get_objsize(oid)) { if (is_sparse_object(oid)) ret = xftruncate(fd, obj_size); else ret = prealloc(fd, obj_size); if (ret < 0) { ret = err_to_sderr(path, oid, errno); goto out; } } ret = xpwrite(fd, iocb->buf, len, offset); if (ret != len) { sd_err("failed to write object. %m"); ret = err_to_sderr(path, oid, errno); goto out; } ret = rename(tmp_path, path); if (ret < 0) { sd_err("failed to rename %s to %s: %m", tmp_path, path); ret = err_to_sderr(path, oid, errno); goto out; } ret = SD_RES_SUCCESS; objlist_cache_insert(oid); out: if (ret != SD_RES_SUCCESS) unlink(tmp_path); close(fd); return ret; }
int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = get_open_flags(oid, true, iocb->flags); int ret, fd; uint32_t len = iocb->length; get_obj_path(oid, path); get_tmp_obj_path(oid, tmp_path); if (uatomic_is_true(&sys->use_journal) && journal_file_write(oid, iocb->buf, iocb->length, iocb->offset, true) != SD_RES_SUCCESS) { sd_eprintf("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(tmp_path, flags, def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_dprintf("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_eprintf("failed to open %s: %m", tmp_path); return err_to_sderr(oid, errno); } if (iocb->offset != 0 || iocb->length != get_objsize(oid)) { ret = prealloc(fd, get_objsize(oid)); if (ret < 0) { ret = err_to_sderr(oid, errno); goto out; } } ret = xpwrite(fd, iocb->buf, len, iocb->offset); if (ret != len) { sd_eprintf("failed to write object. %m"); ret = err_to_sderr(oid, errno); goto out; } ret = rename(tmp_path, path); if (ret < 0) { sd_eprintf("failed to rename %s to %s: %m", tmp_path, path); ret = err_to_sderr(oid, errno); goto out; } sd_dprintf("%"PRIx64, oid); ret = SD_RES_SUCCESS; out: if (ret != SD_RES_SUCCESS) unlink(tmp_path); close(fd); return ret; }
static int farm_read(uint64_t oid, struct siocb *iocb) { int flags = def_open_flags, fd, ret = SD_RES_SUCCESS; uint32_t epoch = sys_epoch(); char path[PATH_MAX]; ssize_t size; int i; void *buffer; if (iocb->epoch < epoch) { buffer = read_working_object(oid, iocb->offset, iocb->length); if (!buffer) { /* Here if read the object from the targeted epoch failed, * we need to read from the later epoch, because at some epoch * we doesn't write the object to the snapshot, we assume * it in the current local object directory, but maybe * in the next epoch we removed it from the local directory. * in this case, we should try to retrieve object upwards, since. * when the object is to be removed, it will get written to the * snapshot at later epoch. */ for (i = iocb->epoch; i < epoch; i++) { buffer = retrieve_object_from_snap(oid, i); if (buffer) break; } } if (!buffer) return SD_RES_NO_OBJ; memcpy(iocb->buf, buffer, iocb->length); free(buffer); return SD_RES_SUCCESS; } if (!is_data_obj(oid)) flags &= ~O_DIRECT; sprintf(path, "%s%016"PRIx64, obj_path, oid); fd = open(path, flags); if (fd < 0) return err_to_sderr(oid, errno); if (flock(fd, LOCK_SH) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } size = xpread(fd, iocb->buf, iocb->length, iocb->offset); if (flock(fd, LOCK_UN) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } if (size != iocb->length) { ret = SD_RES_EIO; goto out; } out: close(fd); return ret; }
int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = prepare_iocb(oid, iocb, true); int ret, fd; uint32_t len = iocb->length; size_t obj_size; sd_debug("%"PRIx64, oid); get_store_path(oid, iocb->ec_index, path); get_store_tmp_path(oid, iocb->ec_index, tmp_path); fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } obj_size = get_store_objsize(oid); ret = prealloc(fd, obj_size); if (ret < 0) { ret = err_to_sderr(path, oid, errno); goto out; } ret = xpwrite(fd, iocb->buf, len, iocb->offset); if (ret != len) { sd_err("failed to write object. %m"); ret = err_to_sderr(path, oid, errno); goto out; } /* * Modern FS like ext4, xfs defaults to automatic syncing of files after * replace-via-rename and replace-via-truncate operations. So rename * without fsync() is actually safe. */ ret = rename(tmp_path, path); if (ret < 0) { sd_err("failed to rename %s to %s: %m", tmp_path, path); ret = err_to_sderr(path, oid, errno); goto out; } ret = SD_RES_SUCCESS; objlist_cache_insert(oid); out: if (ret != SD_RES_SUCCESS && unlink(tmp_path) != 0) sd_err("failed to unlink %s: %m", tmp_path); close(fd); return ret; }