int default_write(uint64_t oid, struct siocb *iocb) { int flags = get_open_flags(oid, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } get_obj_path(oid, path); if (iocb->flags & SD_FLAG_CMD_CACHE && is_disk_cache_enabled()) flags &= ~O_DSYNC; fd = open(path, flags, def_fmode); if (fd < 0) return err_to_sderr(oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (size != iocb->length) { eprintf("failed to write object %"PRIx64", path=%s, offset=%" PRId64", size=%"PRId32", result=%zd, %m\n", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(oid, errno); goto out; } out: close(fd); return ret; }
int default_write(uint64_t oid, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; uint32_t len = iocb->length; uint64_t offset = iocb->offset; static bool trim_is_supported = true; if (iocb->epoch < sys_epoch()) { sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } if (uatomic_is_true(&sys->use_journal) && unlikely(journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, false)) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } get_store_path(oid, iocb->ec_index, path); /* * Make sure oid is in the right place because oid might be misplaced * in a wrong place, due to 'shutdown/restart with less/more disks' or * any bugs. We need call err_to_sderr() to return EIO if disk is broken */ if (!default_exist(oid, iocb->ec_index)) return err_to_sderr(path, oid, ENOENT); fd = open(path, flags, sd_def_fmode); if (unlikely(fd < 0)) return err_to_sderr(path, oid, errno); if (trim_is_supported && is_sparse_object(oid)) { if (default_trim(fd, oid, iocb, &offset, &len) < 0) { trim_is_supported = false; offset = iocb->offset; len = iocb->length; } } size = xpwrite(fd, iocb->buf, len, offset); if (unlikely(size != len)) { sd_err("failed to write object %"PRIx64", path=%s, offset=%" PRId32", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); goto out; } out: close(fd); return ret; }
static int farm_write(uint64_t oid, struct siocb *iocb, int create) { int flags = def_open_flags, fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } if (!is_data_obj(oid)) flags &= ~O_DIRECT; if (create) flags |= O_CREAT | O_TRUNC; sprintf(path, "%s%016"PRIx64, obj_path, oid); fd = open(path, flags, def_fmode); if (fd < 0) return err_to_sderr(oid, errno); if (flock(fd, LOCK_EX) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } if (create && !(iocb->flags & SD_FLAG_CMD_COW)) { ret = prealloc(fd, get_objsize(oid)); if (ret != SD_RES_SUCCESS) { if (flock(fd, LOCK_UN) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } goto out; } } size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (flock(fd, LOCK_UN) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } if (size != iocb->length) { eprintf("%m\n"); ret = SD_RES_EIO; goto out; } trunk_update_entry(oid); out: close(fd); return ret; }
/* * Wait for all forward requests completion. * * Even if something goes wrong, we have to wait forward requests completion to * avoid interleaved requests. * * Return error code if any one request fails. */ static int wait_forward_request(struct write_info *wi, struct request *req) { int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i; struct pfd_info pi; struct sd_rsp *rsp = &req->rp; again: pfd_info_init(wi, &pi); pollret = poll(pi.pfds, pi.nr, 5000); if (pollret < 0) { if (errno == EINTR) goto again; panic("%m\n"); } else if (pollret == 0) { eprintf("poll timeout %d\n", wi->nr_sent); if (req->rq.epoch == sys_epoch()) goto again; nr_sent = wi->nr_sent; /* XXX Blinedly close all the connections */ for (i = 0; i < nr_sent; i++) finish_one_write_err(wi, i); err_ret = SD_RES_NETWORK_ERROR; goto finish_write; } nr_sent = wi->nr_sent; for (i = 0; i < nr_sent; i++) if (pi.pfds[i].revents & POLLIN) break; if (i < nr_sent) { int re = pi.pfds[i].revents; dprintf("%d, revents %x\n", i, re); if (re & (POLLERR | POLLHUP | POLLNVAL)) { err_ret = SD_RES_NETWORK_ERROR; finish_one_write_err(wi, i); goto finish_write; } if (do_read(pi.pfds[i].fd, rsp, sizeof(*rsp))) { eprintf("remote node might have gone away\n"); err_ret = SD_RES_NETWORK_ERROR; finish_one_write_err(wi, i); goto finish_write; } ret = rsp->result; if (ret != SD_RES_SUCCESS) { eprintf("fail %"PRIx32"\n", ret); err_ret = ret; } finish_one_write(wi, i); } finish_write: if (wi->nr_sent > 0) goto again; return err_ret; }
static int farm_link(uint64_t oid, struct siocb *iocb, uint32_t tgt_epoch) { int ret = SD_RES_EIO; void *buf = NULL; struct siocb io = { 0 }; int i; uint32_t epoch = sys_epoch(); dprintf("try link %"PRIx64" from snapshot with epoch %d\n", oid, tgt_epoch); for (i = tgt_epoch; i < epoch; i++) { buf = retrieve_object_from_snap(oid, i); if (buf) break; } if (!buf) goto out; io.length = iocb->length; io.buf = buf; ret = farm_atomic_put(oid, &io); out: free(buf); return ret; }
int snap_file_write(uint32_t epoch, unsigned char *trunksha1, unsigned char *outsha1, int user) { int ret = 0; struct strbuf buf = STRBUF_INIT; struct sd_node nodes[SD_MAX_NODES]; int tgt_epoch = user ? sys_epoch() : epoch; uint64_t epoch_size; struct sha1_file_hdr hdr; epoch_size = epoch_log_read(tgt_epoch, (char *)nodes, sizeof(nodes)); if (epoch_size == -1) return -1; memcpy(hdr.tag, TAG_SNAP, TAG_LEN); hdr.size = epoch_size + SHA1_LEN; hdr.priv = tgt_epoch; hdr.reserved = 0; strbuf_add(&buf, &hdr, sizeof(hdr)); strbuf_add(&buf, trunksha1, SHA1_LEN); strbuf_add(&buf, (char *)nodes, epoch_size); if (sha1_file_write((void *)buf.buf, buf.len, outsha1) < 0) { ret = -1; goto err; } dprintf("epoch %u, sha1: %s\n", epoch, sha1_to_hex(outsha1)); err: strbuf_release(&buf); return ret; }
int default_write(uint64_t oid, const struct siocb *iocb) { int flags = get_open_flags(oid, false, iocb->flags), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { sd_dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } get_obj_path(oid, path); if (uatomic_is_true(&sys->use_journal) && journal_file_write(oid, iocb->buf, iocb->length, iocb->offset, false) != SD_RES_SUCCESS) { sd_eprintf("turn off journaling\n"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(path, flags, def_fmode); if (fd < 0) return err_to_sderr(oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (size != iocb->length) { sd_eprintf("failed to write object %"PRIx64", path=%s, offset=%" PRId64", size=%"PRId32", result=%zd, %m\n", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(oid, errno); goto out; } out: close(fd); return ret; }
int default_write(uint64_t oid, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } if (uatomic_is_true(&sys->use_journal) && unlikely(journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, false)) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } get_obj_path(oid, path); fd = open(path, flags, sd_def_fmode); if (unlikely(fd < 0)) return err_to_sderr(path, oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (unlikely(size != iocb->length)) { sd_err("failed to write object %"PRIx64", path=%s, offset=%" PRId64", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); goto out; } out: close(fd); return ret; }
int default_write(uint64_t oid, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } get_store_path(oid, iocb->ec_index, path); /* * Make sure oid is in the right place because oid might be misplaced * in a wrong place, due to 'shutdown/restart with less/more disks' or * any bugs. We need call err_to_sderr() to return EIO if disk is broken */ if (!default_exist(oid, iocb->ec_index)) return err_to_sderr(path, oid, errno); fd = open(path, flags, sd_def_fmode); if (unlikely(fd < 0)) return err_to_sderr(path, oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (unlikely(size != iocb->length)) { sd_err("failed to write object %"PRIx64", path=%s, offset=%" PRId32", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); goto out; } out: close(fd); return ret; }
static int recover_object_from_replica(uint64_t oid, const struct sd_vnode *vnode, uint32_t epoch, uint32_t tgt_epoch) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; unsigned rlen; int ret = SD_RES_NO_MEM; void *buf = NULL; struct siocb iocb = { 0 }; if (vnode_is_local(vnode) && tgt_epoch < sys_epoch()) { ret = sd_store->link(oid, tgt_epoch); goto out; } rlen = get_objsize(oid); buf = valloc(rlen); if (!buf) { sd_eprintf("%m"); goto out; } sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; hdr.data_length = rlen; hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; ret = sheep_exec_req(&vnode->nid, &hdr, buf); if (ret != SD_RES_SUCCESS) goto out; iocb.epoch = epoch; iocb.length = rsp->data_length; iocb.offset = rsp->obj.offset; iocb.buf = buf; ret = sd_store->create_and_write(oid, &iocb); out: if (ret == SD_RES_SUCCESS) { sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid, tgt_epoch, epoch); objlist_cache_insert(oid); } free(buf); return ret; }
/*
 * Read object @oid into @iocb via default_read_from_path().
 *
 * The current object path is tried first.  While the result is
 * SD_RES_NO_OBJ and the request epoch is older than the epoch being
 * probed, the stale path of each earlier epoch is tried in turn, counting
 * down from sys_epoch() - 1.
 *
 * Returns the result of the last default_read_from_path() call.
 */
int default_read(uint64_t oid, struct siocb *iocb)
{
	uint32_t probe_epoch = sys_epoch();
	char path[PATH_MAX];
	int status;

	get_obj_path(oid, path);
	status = default_read_from_path(oid, path, iocb);

	/*
	 * If the request is against an older epoch, fall back to the
	 * stale directories, newest first.
	 */
	while (status == SD_RES_NO_OBJ) {
		if (iocb->epoch >= probe_epoch)
			break;

		probe_epoch--;
		get_stale_obj_path(oid, probe_epoch, path);
		status = default_read_from_path(oid, path, iocb);
	}

	return status;
}
/*
 * Read object @oid into @iocb via default_read_from_path().
 *
 * The regular store path is tried first.  If that yields SD_RES_NO_OBJ and
 * the request carries a non-zero epoch older than sys_epoch(), a single
 * retry is made against the stale path for exactly that epoch.
 *
 * Returns the result of the last default_read_from_path() call.
 */
int default_read(uint64_t oid, const struct siocb *iocb)
{
	char path[PATH_MAX];
	int status;

	get_store_path(oid, iocb->ec_index, path);
	status = default_read_from_path(oid, path, iocb);
	if (status != SD_RES_NO_OBJ)
		return status;

	/* Only requests against an older, valid epoch get the stale retry. */
	if (iocb->epoch <= 0 || iocb->epoch >= sys_epoch())
		return status;

	get_store_stale_path(oid, iocb->epoch, iocb->ec_index, path);
	return default_read_from_path(oid, path, iocb);
}
/* Fetch the object list from all the nodes in the cluster */ static uint64_t *fetch_object_list(struct sd_node *e, uint32_t epoch, size_t *nr_oids) { char name[128]; struct sd_list_req hdr; struct sd_list_rsp *rsp = (struct sd_list_rsp *)&hdr; size_t buf_size = list_buffer_size; uint64_t *buf = xmalloc(buf_size); int ret; addr_to_str(name, sizeof(name), e->nid.addr, 0); sd_dprintf("%s %"PRIu32, name, e->nid.port); retry: sd_init_req((struct sd_req *)&hdr, SD_OP_GET_OBJ_LIST); hdr.tgt_epoch = epoch - 1; hdr.data_length = buf_size; hdr.epoch = sys_epoch(); ret = sheep_exec_req(&e->nid, (struct sd_req *)&hdr, buf); switch (ret) { case SD_RES_SUCCESS: break; case SD_RES_BUFFER_SMALL: buf_size *= 2; buf = xrealloc(buf, buf_size); goto retry; default: free(buf); return NULL; } *nr_oids = rsp->data_length / sizeof(uint64_t); sd_dprintf("%zu", *nr_oids); return buf; }
/*
 * Tell whether @epoch is still the current system epoch.
 *
 * Returns true when sys_epoch() equals @epoch, false otherwise.
 */
bool sheep_need_retry(uint32_t epoch)
{
	if (sys_epoch() == epoch)
		return true;

	return false;
}
static int farm_read(uint64_t oid, struct siocb *iocb) { int flags = def_open_flags, fd, ret = SD_RES_SUCCESS; uint32_t epoch = sys_epoch(); char path[PATH_MAX]; ssize_t size; int i; void *buffer; if (iocb->epoch < epoch) { buffer = read_working_object(oid, iocb->offset, iocb->length); if (!buffer) { /* Here if read the object from the targeted epoch failed, * we need to read from the later epoch, because at some epoch * we doesn't write the object to the snapshot, we assume * it in the current local object directory, but maybe * in the next epoch we removed it from the local directory. * in this case, we should try to retrieve object upwards, since. * when the object is to be removed, it will get written to the * snapshot at later epoch. */ for (i = iocb->epoch; i < epoch; i++) { buffer = retrieve_object_from_snap(oid, i); if (buffer) break; } } if (!buffer) return SD_RES_NO_OBJ; memcpy(iocb->buf, buffer, iocb->length); free(buffer); return SD_RES_SUCCESS; } if (!is_data_obj(oid)) flags &= ~O_DIRECT; sprintf(path, "%s%016"PRIx64, obj_path, oid); fd = open(path, flags); if (fd < 0) return err_to_sderr(oid, errno); if (flock(fd, LOCK_SH) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } size = xpread(fd, iocb->buf, iocb->length, iocb->offset); if (flock(fd, LOCK_UN) < 0) { ret = SD_RES_EIO; eprintf("%m\n"); goto out; } if (size != iocb->length) { ret = SD_RES_EIO; goto out; } out: close(fd); return ret; }