static int init_vdi_copy_number(uint64_t oid, char *wd) { char path[PATH_MAX]; int fd, flags = get_open_flags(oid, false, 0), ret; struct sheepdog_inode *inode = xzalloc(sizeof(*inode)); snprintf(path, sizeof(path), "%s/%016"PRIx64, wd, oid); fd = open(path, flags); if (fd < 0) { sd_eprintf("failed to open %s, %m", path); ret = SD_RES_EIO; goto out; } ret = xpread(fd, inode, SD_INODE_HEADER_SIZE, 0); if (ret != SD_INODE_HEADER_SIZE) { sd_eprintf("failed to read inode header, path=%s, %m", path); ret = SD_RES_EIO; goto out; } add_vdi_copy_number(oid_to_vid(oid), inode->nr_copies); ret = SD_RES_SUCCESS; out: free(inode); return SD_RES_SUCCESS; }
static int update_epoch_from_v0_to_v1(uint32_t epoch) { char path[PATH_MAX]; struct sd_node_v0 nodes_v0[SD_MAX_NODES]; struct sd_node_v1 nodes_v1[SD_MAX_NODES]; size_t nr_nodes; time_t *t; int len, fd, ret; snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch); fd = open(path, O_RDWR | O_DSYNC); if (fd < 0) { if (errno == ENOENT) return 0; sd_eprintf("failed to open epoch %"PRIu32" log", epoch); return -1; } ret = xread(fd, nodes_v0, sizeof(nodes_v0)); if (ret < 0) { sd_eprintf("failed to read epoch %"PRIu32" log", epoch); close(fd); return ret; } nr_nodes = ret / sizeof(nodes_v0[0]); for (int i = 0; i < nr_nodes; i++) { memcpy(&nodes_v1[i].nid, &nodes_v0[i].nid, sizeof(struct node_id_v1)); nodes_v1[i].nr_vnodes = nodes_v0[i].nr_vnodes; nodes_v1[i].zone = nodes_v0[i].zone; nodes_v1[i].space = 0; } len = sizeof(nodes_v1[0]) * nr_nodes; ret = xpwrite(fd, nodes_v1, len, 0); if (ret != len) { sd_eprintf("failed to write epoch %"PRIu32" log", epoch); close(fd); return -1; } t = (time_t *)&nodes_v0[nr_nodes]; ret = xpwrite(fd, t, sizeof(*t), len); if (ret != sizeof(*t)) { sd_eprintf("failed to write time to epoch %" PRIu32" log", epoch); close(fd); return -1; } close(fd); return 0; }
/* copy file from 'fname' to 'fname.suffix' */ static int backup_file(char *fname, char *suffix) { char dst_file[PATH_MAX]; int fd = -1, ret = -1, len; void *buf = NULL; snprintf(dst_file, sizeof(dst_file), "%s.%s", fname, suffix); fd = open(fname, O_RDONLY); if (fd < 0) { if (errno != ENOENT) { sd_eprintf("failed to open %s, %m", fname); ret = -1; } else ret = 0; goto out; } len = get_file_size(fname); if (len < 0) goto out; buf = xmalloc(len); ret = xread(fd, buf, len); if (ret != len) { sd_eprintf("failed to read %s, %d %m", fname, ret); ret = -1; goto out; } close(fd); fd = open(dst_file, O_CREAT | O_WRONLY | O_DSYNC, 0644); if (fd < 0) { sd_eprintf("failed to create %s, %m", dst_file); ret = -1; goto out; } ret = xwrite(fd, buf, len); if (ret != len) { sd_eprintf("failed to write to %s, %d %m", dst_file, ret); ret = -1; } out: if (fd >= 0) close(fd); free(buf); return ret; }
static int migrate_from_v0_to_v1(void) { int ret, fd; struct sheepdog_config_v1 config; fd = open(config_path, O_RDWR); if (fd < 0) { sd_eprintf("failed to open config file, %m"); return -1; } memset(&config, 0, sizeof(config)); ret = xread(fd, &config, sizeof(config)); if (ret < 0) { sd_eprintf("failed to read config file, %m"); close(fd); return ret; } config.version = 1; ret = xpwrite(fd, &config, sizeof(config), 0); if (ret != sizeof(config)) { sd_eprintf("failed to write config data, %m"); close(fd); return -1; } /* 0.5.1 could wrongly extend the config file, so truncate it here */ ret = ftruncate(fd, sizeof(config)); if (ret != 0) { sd_eprintf("failed to truncate config data, %m"); close(fd); return -1; } close(fd); /* * If the config file contains a space field, the store layout * is compatible with v1. In this case, what we need to do is * only adding version number to the config file. */ if (config.space > 0) return 0; /* upgrade epoch log */ for_each_epoch(update_epoch_from_v0_to_v1); return ret; }
/*
 * Send the request header 'hdr', followed by 'wlen' bytes of 'data' when
 * wlen is non-zero, over 'sockfd' as one scatter/gather message.
 *
 * 'need_retry' and 'epoch' are handed through to do_write().
 * Returns 0 on success and -1 when do_write() reports a failure.
 */
int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen,
	     bool (*need_retry)(uint32_t epoch), uint32_t epoch)
{
	struct iovec iov[2];
	struct msghdr msg;

	/* the header always goes first */
	iov[0].iov_base = hdr;
	iov[0].iov_len = sizeof(*hdr);

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = iov;
	msg.msg_iovlen = 1;

	/* the payload is optional */
	if (wlen) {
		iov[1].iov_base = data;
		iov[1].iov_len = wlen;
		msg.msg_iovlen = 2;
	}

	if (do_write(sockfd, &msg, sizeof(*hdr) + wlen, need_retry, epoch)) {
		sd_eprintf("failed to send request %x, %d: %m", hdr->opcode,
			   wlen);
		return -1;
	}

	return 0;
}
/*
 * Push 'len' bytes described by 'msg' through 'sockfd' with sendmsg(),
 * looping until everything has been sent.
 *
 * EINTR is always retried.  EAGAIN is retried at most MAX_RETRY_COUNT
 * times, and only while 'need_retry' is NULL or need_retry(epoch) is true.
 * Returns 0 on success and 1 on failure.
 */
static int do_write(int sockfd, struct msghdr *msg, int len,
		    bool (*need_retry)(uint32_t), uint32_t epoch)
{
	int retries_left = MAX_RETRY_COUNT;

	for (;;) {
		int sent = sendmsg(sockfd, msg, 0);

		if (sent >= 0) {
			len -= sent;
			if (!len)
				return 0;

			/* short write: skip what has gone out already */
			forward_iov(msg, sent);
			continue;
		}

		if (errno == EINTR)
			continue;

		/*
		 * Since we set timeout for write, we'll get EAGAIN even for
		 * blocking sockfd.
		 */
		if (errno == EAGAIN && retries_left &&
		    (need_retry == NULL || need_retry(epoch))) {
			retries_left--;
			continue;
		}

		sd_eprintf("failed to write to socket: %m");
		return 1;
	}
}
/*
 * Add O_NONBLOCK to the file status flags of 'fd'.
 *
 * Returns the (non-negative) result of fcntl(F_SETFL) on success and a
 * negative value when either fcntl() call fails.
 *
 * The descriptor is never closed here.  It used to be closed on the
 * F_GETFL failure path only, which made ownership depend on which call
 * failed and led to a second close() in callers that release the
 * descriptor themselves on error (e.g. create_unix_domain_socket()).
 */
int set_nonblocking(int fd)
{
	int ret;

	ret = fcntl(fd, F_GETFL);
	if (ret < 0) {
		sd_eprintf("fcntl F_GETFL failed: %m");
		return ret;
	}

	ret = fcntl(fd, F_SETFL, ret | O_NONBLOCK);
	if (ret < 0)
		sd_eprintf("fcntl O_NONBLOCK failed: %m");

	return ret;
}
static int init_obj_path(const char *base_path, char *argp) { char *p; int len; if (check_path_len(base_path) < 0) return -1; #define OBJ_PATH "/obj" len = strlen(base_path) + strlen(OBJ_PATH) + 1; obj_path = xzalloc(len); snprintf(obj_path, len, "%s" OBJ_PATH, base_path); /* Eat up the first component */ strtok(argp, ","); p = strtok(NULL, ","); if (!p) { /* * If We have only one path, meta-store and object-store share * it. This is helpful to upgrade old sheep cluster to * the MD-enabled. */ md_add_disk(obj_path); } else { do { if (is_meta_store(p)) { sd_eprintf("%s is meta-store, abort", p); return -1; } md_add_disk(p); } while ((p = strtok(NULL, ","))); } return xmkdir(obj_path, sd_def_dmode); }
int sheep_exec_req(const struct node_id *nid, struct sd_req *hdr, void *buf) { struct sd_rsp *rsp = (struct sd_rsp *)hdr; struct sockfd *sfd; int ret; assert(is_worker_thread()); sfd = sockfd_cache_get(nid); if (!sfd) return SD_RES_NETWORK_ERROR; ret = exec_req(sfd->fd, hdr, buf, sheep_need_retry, hdr->epoch, MAX_RETRY_COUNT); if (ret) { sd_dprintf("remote node might have gone away"); sockfd_cache_del(nid, sfd); return SD_RES_NETWORK_ERROR; } ret = rsp->result; if (ret != SD_RES_SUCCESS) sd_eprintf("failed %s", sd_strerror(ret)); sockfd_cache_put(nid, sfd); return ret; }
static int do_shepherd_join(void) { int ret, msg_join_len; struct sph_msg msg; struct sph_msg_join *msg_join; msg_join_len = sizeof(struct sph_msg_join) + kept_opaque_len; memset(&msg, 0, sizeof(msg)); msg.type = SPH_CLI_MSG_JOIN; msg.body_len = msg_join_len; msg_join = xzalloc(msg_join_len); msg_join->node = this_node; memcpy(msg_join->opaque, kept_opaque, kept_opaque_len); ret = writev2(sph_comm_fd, &msg, msg_join, msg_join_len); if (sizeof(msg) + msg_join_len != ret) { sd_eprintf("do_shepherd_join() failed, %m"); free(msg_join); return -1; } free(msg_join); return 0; }
/* * Recover the object from its track in epoch history. That is, * the routine will try to recovery it from the nodes it has stayed, * at least, *theoretically* on consistent hash ring. */ static int do_recover_object(struct recovery_work *rw) { struct vnode_info *old; uint64_t oid = rw->oids[rw->done]; uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1; int nr_copies, ret, i; old = grab_vnode_info(rw->old_vinfo); again: sd_dprintf("try recover object %"PRIx64" from epoch %"PRIu32, oid, tgt_epoch); /* Let's do a breadth-first search */ nr_copies = get_obj_copy_number(oid, old->nr_zones); for (i = 0; i < nr_copies; i++) { const struct sd_vnode *tgt_vnode; tgt_vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i); if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes, rw->cur_vinfo->nr_nodes)) continue; ret = recover_object_from_replica(oid, tgt_vnode, epoch, tgt_epoch); if (ret == SD_RES_SUCCESS) { /* Succeed */ break; } else if (SD_RES_OLD_NODE_VER == ret) { rw->stop = true; goto err; } else ret = -1; } /* No luck, roll back to an older configuration and try again */ if (ret < 0) { struct vnode_info *new_old; rollback: tgt_epoch--; if (tgt_epoch < 1) { sd_eprintf("can not recover oid %"PRIx64, oid); ret = -1; goto err; } new_old = get_vnode_info_epoch(tgt_epoch); if (!new_old) /* We rollback in case we don't get a valid epoch */ goto rollback; put_vnode_info(old); old = new_old; goto again; } err: put_vnode_info(old); return ret; }
static struct vdi_op_message *prepare_cluster_msg(struct request *req, size_t *sizep) { struct vdi_op_message *msg; size_t size; if (has_process_main(req->op) && req->rq.flags & SD_FLAG_CMD_WRITE) size = sizeof(*msg) + req->rq.data_length; else size = sizeof(*msg); assert(size <= SD_MAX_EVENT_BUF_SIZE); msg = zalloc(size); if (!msg) { sd_eprintf("failed to allocate memory\n"); return NULL; } memcpy(&msg->req, &req->rq, sizeof(struct sd_req)); memcpy(&msg->rsp, &req->rp, sizeof(struct sd_rsp)); if (has_process_main(req->op) && req->rq.flags & SD_FLAG_CMD_WRITE) memcpy(msg->data, req->data, req->rq.data_length); *sizep = size; return msg; }
/*
 * Create (or truncate) the journal file "<root>/<name>", opened with
 * O_DSYNC | O_DIRECT, and preallocate jfile_size bytes for it.
 *
 * Returns the open file descriptor on success and -1 on failure.  The
 * descriptor is closed again when preallocation fails; it used to be
 * leaked on that path.
 */
static int create_journal_file(const char *root, const char *name)
{
	int fd, flags = O_DSYNC | O_RDWR | O_TRUNC | O_CREAT | O_DIRECT;
	char path[PATH_MAX];

	snprintf(path, sizeof(path), "%s/%s", root, name);
	fd = open(path, flags, 0644);
	if (fd < 0) {
		sd_eprintf("open %s %m", name);
		return -1;
	}

	if (prealloc(fd, jfile_size) < 0) {
		/* log first: close() may overwrite the errno used by %m */
		sd_eprintf("prealloc %s %m", name);
		close(fd);
		return -1;
	}

	return fd;
}
static int restore_objects_from_snap(uint32_t epoch) { struct sha1_file_hdr hdr; struct trunk_entry *trunk_buf, *trunk_free = NULL; unsigned char trunk_sha1[SHA1_LEN]; uint64_t nr_trunks, i; int ret = SD_RES_EIO; if (get_trunk_sha1(epoch, trunk_sha1) < 0) goto out; trunk_free = trunk_buf = trunk_file_read(trunk_sha1, &hdr); if (!trunk_buf) goto out; nr_trunks = hdr.priv; ret = SD_RES_SUCCESS; for (i = 0; i < nr_trunks; i++, trunk_buf++) { struct sha1_file_hdr h; struct siocb io = { 0 }; uint64_t oid; void *buffer = NULL; oid = trunk_buf->oid; buffer = sha1_file_read(trunk_buf->sha1, &h); if (!buffer) { sd_eprintf("oid %"PRIx64" not restored", oid); goto out; } io.length = h.size; io.buf = buffer; ret = default_create_and_write(oid, &io); if (ret != SD_RES_SUCCESS) { sd_eprintf("oid %"PRIx64" not restored", oid); goto out; } else sd_dprintf("oid %"PRIx64" restored", oid); free(buffer); } out: free(trunk_free); return ret; }
/*
 * Empty the stale directory: remove stale_dir recursively and create it
 * again with mode 0755.
 *
 * Returns SD_RES_SUCCESS, or SD_RES_EIO when mkdir() fails.  The result
 * of rmdir_r() is not checked.
 */
int default_cleanup(void)
{
	rmdir_r(stale_dir);

	if (mkdir(stale_dir, 0755) >= 0)
		return SD_RES_SUCCESS;

	sd_eprintf("%m\n");
	return SD_RES_EIO;
}
/*
 * Read exactly one struct sph_msg from sph_comm_fd into 'rcv'.
 * Any short or failed read is logged and terminates the process with
 * exit status 1.
 */
static void read_msg(struct sph_msg *rcv)
{
	int nread = xread(sph_comm_fd, rcv, sizeof(*rcv));

	if (nread == sizeof(*rcv))
		return;

	sd_eprintf("xread() failed: %m");
	exit(1);
}
/*
 * Check that 'path' is short enough to be used as an object directory.
 *
 * PATH_MAX counts the terminating NUL byte, so a path whose length equals
 * PATH_MAX does not fit into a char[PATH_MAX] buffer and is rejected too.
 *
 * Returns 0 when the path is acceptable and -1 (after logging) otherwise.
 */
static inline int check_path_len(const char *path)
{
	/* size_t: avoid narrowing the strlen() result to int */
	size_t len = strlen(path);

	if (len >= (size_t)PATH_MAX) {
		sd_eprintf("insanely long object directory %s", path);
		return -1;
	}

	return 0;
}
/*
 * Work handler recovering the object at index rw->done of rw->oids.
 * Nothing is done when the store already has the object; otherwise
 * do_recover_object() is run and a negative result is logged.
 */
static void recover_object_work(struct work *work)
{
	struct recovery_work *rw =
		container_of(work, struct recovery_work, work);
	uint64_t oid = rw->oids[rw->done];

	sd_eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64, rw->done,
		   rw->count, oid);

	if (sd_store->exist(oid)) {
		sd_dprintf("the object is already recovered");
		return;
	}

	if (do_recover_object(rw) < 0)
		sd_eprintf("failed to recover object %"PRIx64, oid);
}
/*
 * Tell whether 'path' is readable and writable for this process.
 * A failure other than ENOENT is logged; any failure yields false.
 */
static inline bool md_access(char *path)
{
	if (access(path, R_OK | W_OK) >= 0)
		return true;

	/* a missing path is expected and stays silent */
	if (errno != ENOENT)
		sd_eprintf("failed to check %s, %m", path);

	return false;
}
/*
 * Store the address of the first non-loopback interface that has an
 * AF_INET or AF_INET6 address into the 16-byte buffer 'bytes'.
 *
 * An IPv4 address is placed in the last 4 bytes with the first 12 bytes
 * zeroed; an IPv6 address fills all 16 bytes.
 *
 * Returns 0 on success, -1 when getifaddrs() fails or no suitable
 * interface exists.
 */
int get_local_addr(uint8_t *bytes)
{
	struct ifaddrs *ifaddr, *ifa;
	int ret = 0;

	if (getifaddrs(&ifaddr) == -1) {
		sd_eprintf("getifaddrs failed: %m");
		return -1;
	}

	for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) {
		struct sockaddr_in *sin;
		struct sockaddr_in6 *sin6;

		if (ifa->ifa_flags & IFF_LOOPBACK)
			continue;
		/* an interface entry may come without an address */
		if (!ifa->ifa_addr)
			continue;

		switch (ifa->ifa_addr->sa_family) {
		case AF_INET:
			sin = (struct sockaddr_in *)ifa->ifa_addr;
			memset(bytes, 0, 12);
			/* this copy used to be issued twice in a row */
			memcpy(bytes + 12, &sin->sin_addr, 4);
			sd_eprintf("found IPv4 address");
			goto out;
		case AF_INET6:
			sin6 = (struct sockaddr_in6 *)ifa->ifa_addr;
			memcpy(bytes, &sin6->sin6_addr, 16);
			sd_eprintf("found IPv6 address");
			goto out;
		}
	}

	sd_eprintf("no valid interface found");
	ret = -1;
out:
	freeifaddrs(ifaddr);
	return ret;
}
/*
 * Create the directory "<path>/.stale" with mode def_dmode via xmkdir().
 * Returns SD_RES_SUCCESS, or SD_RES_EIO when xmkdir() fails.
 */
static int make_stale_dir(char *path)
{
	char stale[PATH_MAX];

	snprintf(stale, sizeof(stale), "%s/.stale", path);

	if (xmkdir(stale, def_dmode) >= 0)
		return SD_RES_SUCCESS;

	sd_eprintf("%s failed, %m", stale);
	return SD_RES_EIO;
}
int default_flush(void) { int fd, ret = SD_RES_SUCCESS; fd = open(obj_path, O_RDONLY); if (fd < 0) { sd_eprintf("error at open() %s, %s\n", obj_path, strerror(errno)); return SD_RES_NO_OBJ; } if (syncfs(fd)) { sd_eprintf("error at syncfs(), %s\n", strerror(errno)); ret = SD_RES_EIO; } close(fd); return ret; }
/*
 * Wipe the object store: remove obj_path recursively, create it again
 * with mode def_dmode and, when the object cache is enabled, format the
 * cache as well.
 *
 * Returns SD_RES_SUCCESS, or SD_RES_EIO when the removal fails for a
 * reason other than -ENOENT or when mkdir() fails.
 */
int default_format(void)
{
	/*
	 * Signed on purpose: the result is compared against -ENOENT and
	 * negated for strerror().  It used to be declared unsigned, which
	 * only worked through implicit conversions.
	 */
	int ret;

	sd_dprintf("try get a clean store\n");
	ret = rmdir_r(obj_path);
	if (ret && ret != -ENOENT) {
		sd_eprintf("failed to remove %s: %s\n", obj_path,
			   strerror(-ret));
		return SD_RES_EIO;
	}

	if (mkdir(obj_path, def_dmode) < 0) {
		sd_eprintf("%m\n");
		return SD_RES_EIO;
	}

	if (is_object_cache_enabled())
		object_cache_format();

	return SD_RES_SUCCESS;
}
int default_write(uint64_t oid, const struct siocb *iocb) { int flags = get_open_flags(oid, false, iocb->flags), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { sd_dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } get_obj_path(oid, path); if (uatomic_is_true(&sys->use_journal) && journal_file_write(oid, iocb->buf, iocb->length, iocb->offset, false) != SD_RES_SUCCESS) { sd_eprintf("turn off journaling\n"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(path, flags, def_fmode); if (fd < 0) return err_to_sderr(oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (size != iocb->length) { sd_eprintf("failed to write object %"PRIx64", path=%s, offset=%" PRId64", size=%"PRId32", result=%zd, %m\n", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(oid, errno); goto out; } out: close(fd); return ret; }
/*
 * Create a non-blocking, listening AF_UNIX stream socket bound to
 * 'unix_path' and hand it to callback(fd, data).
 *
 * Returns 0 when every step including the callback succeeded.  On any
 * failure the socket is closed and -1 is returned.
 */
int create_unix_domain_socket(const char *unix_path,
			      int (*callback)(int, void *), void *data)
{
	int fd, ret;
	/* zero the whole address so no stack garbage is handed to bind() */
	struct sockaddr_un addr = { .sun_family = AF_UNIX };

	pstrcpy(addr.sun_path, sizeof(addr.sun_path), unix_path);

	fd = socket(addr.sun_family, SOCK_STREAM, 0);
	if (fd < 0) {
		sd_eprintf("failed to create socket, %m");
		return -1;
	}

	/* bind() takes a generic struct sockaddr pointer */
	ret = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	if (ret) {
		sd_eprintf("failed to bind socket: %m");
		goto err;
	}

	ret = listen(fd, SOMAXCONN);
	if (ret) {
		sd_eprintf("failed to listen on socket: %m");
		goto err;
	}

	ret = set_nonblocking(fd);
	if (ret < 0)
		goto err;

	ret = callback(fd, data);
	if (ret)
		goto err;

	return 0;
err:
	close(fd);

	return -1;
}
/*
 * Return the size of the file at 'path' as reported by stat().
 *
 * On failure the error is logged and -1 is returned; since the return
 * type is size_t, that reaches the caller as (size_t)-1.
 */
static size_t get_file_size(const char *path)
{
	struct stat st;

	if (stat(path, &st) < 0) {
		sd_eprintf("failed to stat %s, %m", path);
		return -1;
	}

	return st.st_size;
}
/*
 * Check whether 'addr' is a well-formed textual IP address.  A string
 * containing ':' is parsed as IPv6, anything else as IPv4.
 *
 * Returns true when inet_pton() does not return 0; a malformed address is
 * logged and yields false.
 */
bool inetaddr_is_valid(char *addr)
{
	unsigned char parsed[INET6_ADDRSTRLEN];
	int family = AF_INET;

	if (strchr(addr, ':') != NULL)
		family = AF_INET6;

	if (inet_pton(family, addr, parsed) == 0) {
		sd_eprintf("Bad address '%s'", addr);
		return false;
	}

	return true;
}
/*
 * Translate the errno value 'err' of a failed operation on object 'oid'
 * into an SD_RES_* code, logging along the way.
 *
 *   ENOSPC -> SD_RES_NO_SPACE
 *   ENOENT -> SD_RES_NO_OBJ, or SD_RES_EIO when stat() on the path
 *             returned by get_object_path() fails as well
 *   other  -> SD_RES_EIO
 */
int err_to_sderr(uint64_t oid, int err)
{
	struct stat s;

	if (err == ENOSPC) {
		/* TODO: stop automatic recovery */
		sd_eprintf("diskfull, oid=%"PRIx64, oid);
		return SD_RES_NO_SPACE;
	}

	if (err != ENOENT) {
		sd_eprintf("oid=%"PRIx64", %m", oid);
		return SD_RES_EIO;
	}

	if (stat(get_object_path(oid), &s) < 0) {
		sd_eprintf("corrupted");
		return SD_RES_EIO;
	}

	sd_dprintf("object %016" PRIx64 " not found locally", oid);
	return SD_RES_NO_OBJ;
}
/*
 * Create a uniquely named journal file whose name starts with 'jrnl_dir'
 * and open it with O_DSYNC.  The generated path is kept in jd->path and
 * the descriptor in jd->fd.
 *
 * Returns SD_RES_SUCCESS, or SD_RES_UNKNOWN when mkostemp() fails.
 */
static int jrnl_create(struct jrnl_descriptor *jd, const char *jrnl_dir)
{
	/* mkostemp() replaces the trailing XXXXXX with a unique suffix */
	snprintf(jd->path, sizeof(jd->path), "%sXXXXXX", jrnl_dir);

	jd->fd = mkostemp(jd->path, O_DSYNC);
	if (jd->fd >= 0)
		return SD_RES_SUCCESS;

	sd_eprintf("failed to create %s: %m", jd->path);
	return SD_RES_UNKNOWN;
}
/*
 * Store the in-memory 'config' object at config_path through
 * atomic_create_and_write().
 *
 * Returns SD_RES_SUCCESS, or SD_RES_EIO when the write fails.
 */
static int write_config(void)
{
	if (atomic_create_and_write(config_path, (char *)&config,
				    sizeof(config)) >= 0)
		return SD_RES_SUCCESS;

	sd_eprintf("atomic_create_and_write() failed");
	return SD_RES_EIO;
}