Beispiel #1
0
/*
 * Read the inode header of object 'oid' from the directory 'wd' and pass its
 * nr_copies field to add_vdi_copy_number().
 *
 * Returns SD_RES_SUCCESS on success, or SD_RES_EIO when the object file
 * cannot be opened or its header cannot be read completely.
 */
static int init_vdi_copy_number(uint64_t oid, char *wd)
{
	char path[PATH_MAX];
	int fd, flags = get_open_flags(oid, false, 0), ret;
	struct sheepdog_inode *inode = xzalloc(sizeof(*inode));

	snprintf(path, sizeof(path), "%s/%016"PRIx64, wd, oid);

	fd = open(path, flags);
	if (fd < 0) {
		sd_eprintf("failed to open %s, %m", path);
		ret = SD_RES_EIO;
		goto out;
	}

	ret = xpread(fd, inode, SD_INODE_HEADER_SIZE, 0);
	if (ret != SD_INODE_HEADER_SIZE) {
		sd_eprintf("failed to read inode header, path=%s, %m", path);
		ret = SD_RES_EIO;
		goto out_close;
	}

	add_vdi_copy_number(oid_to_vid(oid), inode->nr_copies);

	ret = SD_RES_SUCCESS;
out_close:
	/* the descriptor is only needed for the header read */
	close(fd);
out:
	free(inode);
	/* report the real outcome instead of an unconditional success */
	return ret;
}
Beispiel #2
0
/*
 * Rewrite the log file of 'epoch' in place, converting its node entries from
 * struct sd_node_v0 to struct sd_node_v1 (the new 'space' field is set to 0)
 * and re-appending the timestamp that follows the node array.
 *
 * Returns 0 on success or when the log file does not exist, and a negative
 * value when the file cannot be opened, read, or written, or when its
 * contents are malformed (too many nodes, or no trailing timestamp).
 */
static int update_epoch_from_v0_to_v1(uint32_t epoch)
{
	char path[PATH_MAX];
	/*
	 * One spare slot: the timestamp stored after the node array must
	 * still fit into the buffer when the log holds SD_MAX_NODES entries.
	 */
	struct sd_node_v0 nodes_v0[SD_MAX_NODES + 1];
	struct sd_node_v1 nodes_v1[SD_MAX_NODES];
	size_t nr_nodes, nodes_len, i;
	time_t t;
	int len, fd, ret;

	snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch);
	fd = open(path, O_RDWR | O_DSYNC);
	if (fd < 0) {
		if (errno == ENOENT)
			return 0;

		sd_eprintf("failed to open epoch %"PRIu32" log", epoch);
		return -1;
	}

	ret = xread(fd, nodes_v0, sizeof(nodes_v0));
	if (ret < 0) {
		sd_eprintf("failed to read epoch %"PRIu32" log", epoch);
		close(fd);
		return ret;
	}

	nr_nodes = ret / sizeof(nodes_v0[0]);
	if (nr_nodes > SD_MAX_NODES) {
		sd_eprintf("too many nodes in epoch %"PRIu32" log", epoch);
		close(fd);
		return -1;
	}

	/*
	 * Only use the timestamp if it was really read; copy it out with
	 * memcpy() because the bytes after the last node need not be
	 * suitably aligned for a time_t access.
	 */
	nodes_len = nr_nodes * sizeof(nodes_v0[0]);
	if ((size_t)ret - nodes_len < sizeof(t)) {
		sd_eprintf("epoch %"PRIu32" log has no timestamp", epoch);
		close(fd);
		return -1;
	}
	memcpy(&t, &nodes_v0[nr_nodes], sizeof(t));

	for (i = 0; i < nr_nodes; i++) {
		memcpy(&nodes_v1[i].nid, &nodes_v0[i].nid,
		       sizeof(struct node_id_v1));
		nodes_v1[i].nr_vnodes = nodes_v0[i].nr_vnodes;
		nodes_v1[i].zone = nodes_v0[i].zone;
		nodes_v1[i].space = 0;
	}

	len = sizeof(nodes_v1[0]) * nr_nodes;
	ret = xpwrite(fd, nodes_v1, len, 0);
	if (ret != len) {
		sd_eprintf("failed to write epoch %"PRIu32" log",
			   epoch);
		close(fd);
		return -1;
	}

	/* the timestamp goes right behind the converted node array */
	ret = xpwrite(fd, &t, sizeof(t), len);
	if (ret != sizeof(t)) {
		sd_eprintf("failed to write time to epoch %"
			   PRIu32" log", epoch);
		close(fd);
		return -1;
	}

	close(fd);

	return 0;
}
Beispiel #3
0
/*
 * Duplicate the file 'fname' as 'fname.suffix'.
 *
 * A source file that does not exist is not treated as an error: 0 is
 * returned and nothing is written.  When the copy succeeds the value
 * returned by the final xwrite() (equal to the source size) is returned;
 * every failure yields -1.
 */
static int backup_file(char *fname, char *suffix)
{
	char backup_path[PATH_MAX];
	void *content = NULL;
	int fd, size, ret = -1;

	snprintf(backup_path, sizeof(backup_path), "%s.%s", fname, suffix);

	fd = open(fname, O_RDONLY);
	if (fd < 0) {
		if (errno == ENOENT) {
			/* nothing to back up */
			ret = 0;
		} else {
			sd_eprintf("failed to open %s, %m", fname);
			ret = -1;
		}
		goto out;
	}

	size = get_file_size(fname);
	if (size < 0)
		goto out;

	/* slurp the whole source file into memory */
	content = xmalloc(size);
	ret = xread(fd, content, size);
	if (ret != size) {
		sd_eprintf("failed to read %s, %d %m", fname, ret);
		ret = -1;
		goto out;
	}

	close(fd);

	/* from here on 'fd' refers to the backup file */
	fd = open(backup_path, O_CREAT | O_WRONLY | O_DSYNC, 0644);
	if (fd < 0) {
		sd_eprintf("failed to create %s, %m", backup_path);
		ret = -1;
		goto out;
	}

	ret = xwrite(fd, content, size);
	if (ret != size) {
		sd_eprintf("failed to write to %s, %d %m", backup_path, ret);
		ret = -1;
	}
out:
	if (fd >= 0)
		close(fd);
	free(content);

	return ret;
}
Beispiel #4
0
static int migrate_from_v0_to_v1(void)
{
	int ret, fd;
	struct sheepdog_config_v1 config;

	fd = open(config_path, O_RDWR);
	if (fd < 0) {
		sd_eprintf("failed to open config file, %m");
		return -1;
	}

	memset(&config, 0, sizeof(config));
	ret = xread(fd, &config, sizeof(config));
	if (ret < 0) {
		sd_eprintf("failed to read config file, %m");
		close(fd);
		return ret;
	}

	config.version = 1;
	ret = xpwrite(fd, &config, sizeof(config), 0);
	if (ret != sizeof(config)) {
		sd_eprintf("failed to write config data, %m");
		close(fd);
		return -1;
	}

	/* 0.5.1 could wrongly extend the config file, so truncate it here */
	ret = ftruncate(fd, sizeof(config));
	if (ret != 0) {
		sd_eprintf("failed to truncate config data, %m");
		close(fd);
		return -1;
	}

	close(fd);

	/*
	 * If the config file contains a space field, the store layout
	 * is compatible with v1.  In this case, what we need to do is
	 * only adding version number to the config file.
	 */
	if (config.space > 0)
		return 0;

	/* upgrade epoch log */
	for_each_epoch(update_epoch_from_v0_to_v1);

	return ret;
}
Beispiel #5
0
/*
 * Send the request header 'hdr', followed by 'wlen' bytes of 'data' when
 * 'wlen' is non-zero, over 'sockfd' with a single do_write() call.
 *
 * 'need_retry' and 'epoch' are handed to do_write() unchanged.
 * Returns 0 on success and -1 when do_write() reports a failure.
 */
int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen,
	     bool (*need_retry)(uint32_t epoch), uint32_t epoch)
{
	struct iovec vec[2];
	struct msghdr mh;

	/* first segment: the header; optional second segment: the payload */
	vec[0].iov_base = hdr;
	vec[0].iov_len = sizeof(*hdr);
	if (wlen) {
		vec[1].iov_base = data;
		vec[1].iov_len = wlen;
	}

	memset(&mh, 0, sizeof(mh));
	mh.msg_iov = vec;
	mh.msg_iovlen = wlen ? 2 : 1;

	if (do_write(sockfd, &mh, sizeof(*hdr) + wlen, need_retry, epoch)) {
		sd_eprintf("failed to send request %x, %d: %m", hdr->opcode,
			   wlen);
		return -1;
	}

	return 0;
}
Beispiel #6
0
/*
 * Push 'len' bytes described by 'msg' through 'sockfd', calling sendmsg()
 * repeatedly until everything has been sent.
 *
 * EINTR is always retried.  EAGAIN is retried at most MAX_RETRY_COUNT times,
 * and only while 'need_retry' is NULL or need_retry(epoch) returns true.
 * Returns 0 once all bytes are written and 1 on any other failure.
 */
static int do_write(int sockfd, struct msghdr *msg, int len,
		    bool (*need_retry)(uint32_t), uint32_t epoch)
{
	int retries_left = MAX_RETRY_COUNT;

	for (;;) {
		int sent = sendmsg(sockfd, msg, 0);

		if (sent >= 0) {
			len -= sent;
			if (!len)
				return 0;

			/* partial write: skip what is already out and go on */
			forward_iov(msg, sent);
			continue;
		}

		if (errno == EINTR)
			continue;

		/*
		 * Since we set timeout for write, we'll get EAGAIN even for
		 * blocking sockfd.
		 */
		if (errno == EAGAIN && retries_left &&
		    (need_retry == NULL || need_retry(epoch))) {
			retries_left--;
			continue;
		}

		sd_eprintf("failed to write to socket: %m");
		return 1;
	}
}
Beispiel #7
0
/*
 * Add O_NONBLOCK to the file status flags of 'fd'.
 *
 * Returns the (non-negative) result of fcntl(F_SETFL) on success and a
 * negative value when either fcntl() call fails.  The descriptor is never
 * closed here: it belongs to the caller, which must release it on failure.
 * Closing it in only one of the two failure paths made callers that clean up
 * themselves close the same descriptor twice.
 */
int set_nonblocking(int fd)
{
	int flags, ret;

	flags = fcntl(fd, F_GETFL);
	if (flags < 0) {
		sd_eprintf("fcntl F_GETFL failed: %m");
		return flags;
	}

	ret = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
	if (ret < 0)
		sd_eprintf("fcntl O_NONBLOCK failed: %m");

	return ret;
}
Beispiel #8
0
/*
 * Set the global obj_path to "<base_path>/obj", register the data disks
 * named in 'argp' and create the obj_path directory.
 *
 * 'argp' is a comma separated list that is split in place with strtok().
 * Its first component is skipped; each further component is handed to
 * md_add_disk() unless is_meta_store() flags it.  When there is no further
 * component, obj_path itself is registered as the only disk.
 *
 * Returns the result of xmkdir() on obj_path, or -1 when base_path fails
 * check_path_len() or one of the listed disks is a meta-store.
 */
static int init_obj_path(const char *base_path, char *argp)
{
	char *disk;
	int size;

	if (check_path_len(base_path) < 0)
		return -1;

#define OBJ_PATH "/obj"
	size = strlen(base_path) + strlen(OBJ_PATH) + 1;
	obj_path = xzalloc(size);
	snprintf(obj_path, size, "%s" OBJ_PATH, base_path);

	/* Eat up the first component */
	strtok(argp, ",");
	disk = strtok(NULL, ",");
	if (disk == NULL) {
		/*
		 * With a single path the meta-store and the object-store
		 * share it.  This is helpful to upgrade old sheep cluster to
		 * the MD-enabled.
		 */
		md_add_disk(obj_path);
		return xmkdir(obj_path, sd_def_dmode);
	}

	for (; disk != NULL; disk = strtok(NULL, ",")) {
		if (is_meta_store(disk)) {
			sd_eprintf("%s is meta-store, abort", disk);
			return -1;
		}
		md_add_disk(disk);
	}

	return xmkdir(obj_path, sd_def_dmode);
}
Beispiel #9
0
int sheep_exec_req(const struct node_id *nid, struct sd_req *hdr, void *buf)
{
	struct sd_rsp *rsp = (struct sd_rsp *)hdr;
	struct sockfd *sfd;
	int ret;

	assert(is_worker_thread());

	sfd = sockfd_cache_get(nid);
	if (!sfd)
		return SD_RES_NETWORK_ERROR;

	ret = exec_req(sfd->fd, hdr, buf, sheep_need_retry, hdr->epoch,
		       MAX_RETRY_COUNT);
	if (ret) {
		sd_dprintf("remote node might have gone away");
		sockfd_cache_del(nid, sfd);
		return SD_RES_NETWORK_ERROR;
	}
	ret = rsp->result;
	if (ret != SD_RES_SUCCESS)
		sd_eprintf("failed %s", sd_strerror(ret));

	sockfd_cache_put(nid, sfd);
	return ret;
}
Beispiel #10
0
static int do_shepherd_join(void)
{
	int ret, msg_join_len;
	struct sph_msg msg;
	struct sph_msg_join *msg_join;

	msg_join_len = sizeof(struct sph_msg_join) + kept_opaque_len;

	memset(&msg, 0, sizeof(msg));
	msg.type = SPH_CLI_MSG_JOIN;
	msg.body_len = msg_join_len;

	msg_join = xzalloc(msg_join_len);
	msg_join->node = this_node;
	memcpy(msg_join->opaque, kept_opaque, kept_opaque_len);

	ret = writev2(sph_comm_fd, &msg, msg_join, msg_join_len);
	if (sizeof(msg) + msg_join_len != ret) {
		sd_eprintf("do_shepherd_join() failed, %m");
		free(msg_join);

		return -1;
	}

	free(msg_join);
	return 0;
}
Beispiel #11
0
/*
 * Recover the object from its track in epoch history. That is,
 * the routine will try to recover it from the nodes it has stayed on,
 * at least *theoretically*, on the consistent hash ring.
 *
 * The object is rw->oids[rw->done].  The search starts with the vnode
 * information in rw->old_vinfo and target epoch rw->epoch - 1, and walks
 * back one epoch at a time while no replica can be fetched.
 *
 * Returns SD_RES_SUCCESS when a replica was recovered, SD_RES_OLD_NODE_VER
 * (with rw->stop set) when a replica reports that result, and -1 when no
 * epoch is left to try.
 */
static int do_recover_object(struct recovery_work *rw)
{
	struct vnode_info *old;
	uint64_t oid = rw->oids[rw->done];
	uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1;
	int nr_copies, ret, i;

	old = grab_vnode_info(rw->old_vinfo);

again:
	sd_dprintf("try recover object %"PRIx64" from epoch %"PRIu32, oid,
		   tgt_epoch);

	/*
	 * Assume failure until a replica is actually fetched: when every
	 * candidate below is skipped, 'ret' would otherwise be read without
	 * ever having been assigned.
	 */
	ret = -1;

	/* Let's do a breadth-first search */
	nr_copies = get_obj_copy_number(oid, old->nr_zones);
	for (i = 0; i < nr_copies; i++) {
		const struct sd_vnode *tgt_vnode;

		tgt_vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i);

		if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes,
				     rw->cur_vinfo->nr_nodes))
			continue;
		ret = recover_object_from_replica(oid, tgt_vnode,
						  epoch, tgt_epoch);
		if (ret == SD_RES_SUCCESS) {
			/* Succeed */
			break;
		} else if (SD_RES_OLD_NODE_VER == ret) {
			rw->stop = true;
			goto err;
		} else
			ret = -1;
	}

	/* No luck, roll back to an older configuration and try again */
	if (ret < 0) {
		struct vnode_info *new_old;

rollback:
		/*
		 * tgt_epoch is unsigned, so test before decrementing:
		 * stepping down from 0 would wrap around instead of
		 * dropping below epoch 1.
		 */
		if (tgt_epoch <= 1) {
			sd_eprintf("can not recover oid %"PRIx64, oid);
			ret = -1;
			goto err;
		}
		tgt_epoch--;

		new_old = get_vnode_info_epoch(tgt_epoch);
		if (!new_old)
			/* We rollback in case we don't get a valid epoch */
			goto rollback;

		put_vnode_info(old);
		old = new_old;
		goto again;
	}
err:
	put_vnode_info(old);
	return ret;
}
Beispiel #12
0
/*
 * Build a heap-allocated vdi_op_message from 'req': a copy of the request
 * and response headers, plus req->rq.data_length bytes of req->data when the
 * operation has a process_main handler and SD_FLAG_CMD_WRITE is set.
 *
 * The total message size is stored in '*sizep' and asserted not to exceed
 * SD_MAX_EVENT_BUF_SIZE.  Returns the new message, or NULL when the
 * allocation fails ('*sizep' is left untouched then).
 */
static struct vdi_op_message *prepare_cluster_msg(struct request *req,
		size_t *sizep)
{
	struct vdi_op_message *out;
	size_t total = sizeof(*out);

	/* the payload only travels with write requests handled in main */
	if (has_process_main(req->op) && (req->rq.flags & SD_FLAG_CMD_WRITE))
		total += req->rq.data_length;

	assert(total <= SD_MAX_EVENT_BUF_SIZE);

	out = zalloc(total);
	if (out == NULL) {
		sd_eprintf("failed to allocate memory\n");
		return NULL;
	}

	memcpy(&out->req, &req->rq, sizeof(struct sd_req));
	memcpy(&out->rsp, &req->rp, sizeof(struct sd_rsp));

	if (has_process_main(req->op) && (req->rq.flags & SD_FLAG_CMD_WRITE))
		memcpy(out->data, req->data, req->rq.data_length);

	*sizep = total;
	return out;
}
Beispiel #13
0
/*
 * Create (or truncate) the journal file 'root'/'name' and preallocate
 * jfile_size bytes for it.
 *
 * Returns the open file descriptor on success; the caller owns it.
 * Returns -1 when the file cannot be opened or preallocated; no descriptor
 * stays open in that case.
 */
static int create_journal_file(const char *root, const char *name)
{
	int fd, flags = O_DSYNC | O_RDWR | O_TRUNC | O_CREAT | O_DIRECT;
	char path[PATH_MAX];

	snprintf(path, sizeof(path), "%s/%s", root, name);
	fd = open(path, flags, 0644);
	if (fd < 0) {
		sd_eprintf("open %s %m", name);
		return -1;
	}
	if (prealloc(fd, jfile_size) < 0) {
		/* log first: close() may overwrite the errno behind %m */
		sd_eprintf("prealloc %s %m", name);
		close(fd);
		return -1;
	}

	return fd;
}
Beispiel #14
0
/*
 * Restore every object recorded in the trunk of snapshot 'epoch': each trunk
 * entry's data is loaded with sha1_file_read() and written back with
 * default_create_and_write().
 *
 * Returns SD_RES_SUCCESS when all entries were restored.  Returns SD_RES_EIO
 * when the trunk or an object's data cannot be read, or the result of
 * default_create_and_write() when that call fails.  Processing stops at the
 * first failure.
 */
static int restore_objects_from_snap(uint32_t epoch)
{
	struct sha1_file_hdr hdr;
	struct trunk_entry *trunk_buf, *trunk_free = NULL;
	unsigned char trunk_sha1[SHA1_LEN];
	uint64_t nr_trunks, i;
	int ret = SD_RES_EIO;

	if (get_trunk_sha1(epoch, trunk_sha1) < 0)
		goto out;

	/* trunk_buf walks the entries, trunk_free keeps the start for free() */
	trunk_free = trunk_buf = trunk_file_read(trunk_sha1, &hdr);
	if (!trunk_buf)
		goto out;

	nr_trunks = hdr.priv;
	ret = SD_RES_SUCCESS;
	for (i = 0; i < nr_trunks; i++, trunk_buf++) {
		struct sha1_file_hdr h;
		struct siocb io = { 0 };
		uint64_t oid;
		void *buffer = NULL;

		oid = trunk_buf->oid;
		buffer = sha1_file_read(trunk_buf->sha1, &h);
		if (!buffer) {
			sd_eprintf("oid %"PRIx64" not restored", oid);
			/* 'ret' still says success from the previous round */
			ret = SD_RES_EIO;
			goto out;
		}
		io.length = h.size;
		io.buf = buffer;
		ret = default_create_and_write(oid, &io);
		/* the data is no longer needed, whatever the outcome */
		free(buffer);
		if (ret != SD_RES_SUCCESS) {
			sd_eprintf("oid %"PRIx64" not restored", oid);
			goto out;
		}
		sd_dprintf("oid %"PRIx64" restored", oid);
	}
out:
	free(trunk_free);
	return ret;
}
Beispiel #15
0
/*
 * Empty the stale directory by removing it with rmdir_r() and creating it
 * anew with mode 0755.  The result of rmdir_r() is not examined.
 *
 * Returns SD_RES_SUCCESS, or SD_RES_EIO when mkdir() fails.
 */
int default_cleanup(void)
{
	int rc;

	rmdir_r(stale_dir);

	rc = mkdir(stale_dir, 0755);
	if (rc >= 0)
		return SD_RES_SUCCESS;

	sd_eprintf("%m\n");
	return SD_RES_EIO;
}
Beispiel #16
0
/*
 * Read one complete struct sph_msg from sph_comm_fd into 'rcv'.
 * Does not return on failure: anything but a full read logs an error and
 * terminates the process with exit(1).
 */
static void read_msg(struct sph_msg *rcv)
{
	int nread = xread(sph_comm_fd, rcv, sizeof(*rcv));

	if (nread == sizeof(*rcv))
		return;

	sd_eprintf("xread() failed: %m");
	exit(1);
}
Beispiel #17
0
/*
 * Check that 'path' is short enough to be used as a path name.
 *
 * PATH_MAX counts the terminating NUL byte, so a string of PATH_MAX
 * characters is already too long.
 * Returns 0 when the length is acceptable and -1 (after logging) otherwise.
 */
static inline int check_path_len(const char *path)
{
	size_t len = strlen(path);

	if (len >= PATH_MAX) {
		sd_eprintf("insanely long object directory %s", path);
		return -1;
	}

	return 0;
}
Beispiel #18
0
/*
 * Work handler that recovers the object rw->oids[rw->done] of the
 * recovery_work embedding 'work'.  Nothing is done when the store already
 * has the object; a failed recovery is only logged.
 */
static void recover_object_work(struct work *work)
{
	struct recovery_work *rw = container_of(work, struct recovery_work,
						work);
	uint64_t oid = rw->oids[rw->done];

	sd_eprintf("done:%"PRIu32" count:%"PRIu32", oid:%"PRIx64, rw->done,
		   rw->count, oid);

	if (!sd_store->exist(oid)) {
		if (do_recover_object(rw) < 0)
			sd_eprintf("failed to recover object %"PRIx64, oid);
		return;
	}

	sd_dprintf("the object is already recovered");
}
Beispiel #19
0
/*
 * Tell whether 'path' is both readable and writable for this process.
 * A failure other than ENOENT is logged; a missing path is silently
 * reported as not accessible.
 */
static inline bool md_access(char *path)
{
	if (access(path, R_OK | W_OK) >= 0)
		return true;

	if (errno != ENOENT)
		sd_eprintf("failed to check %s, %m", path);

	return false;
}
Beispiel #20
0
/*
 * Store the address of the first non-loopback interface that has an IPv4 or
 * IPv6 address into 'bytes', which must have room for 16 bytes.
 *
 * An IPv6 address fills all 16 bytes.  An IPv4 address is placed in the last
 * 4 bytes and the first 12 bytes are set to zero.
 * Returns 0 on success and -1 when getifaddrs() fails or no suitable
 * interface exists.
 */
int get_local_addr(uint8_t *bytes)
{
	struct ifaddrs *ifaddr, *ifa;
	int ret = 0;

	if (getifaddrs(&ifaddr) == -1) {
		sd_eprintf("getifaddrs failed: %m");
		return -1;
	}

	for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) {
		struct sockaddr_in *sin;
		struct sockaddr_in6 *sin6;

		if (ifa->ifa_flags & IFF_LOOPBACK)
			continue;
		if (!ifa->ifa_addr)
			continue;

		switch (ifa->ifa_addr->sa_family) {
		case AF_INET:
			sin = (struct sockaddr_in *)ifa->ifa_addr;
			memset(bytes, 0, 12);
			memcpy(bytes + 12, &sin->sin_addr, 4);
			sd_eprintf("found IPv4 address");
			goto out;
		case AF_INET6:
			sin6 = (struct sockaddr_in6 *)ifa->ifa_addr;
			memcpy(bytes, &sin6->sin6_addr, 16);
			sd_eprintf("found IPv6 address");
			goto out;
		default:
			/* other address families are of no use here */
			break;
		}
	}

	sd_eprintf("no valid interface found");
	ret = -1;
out:
	freeifaddrs(ifaddr);
	return ret;
}
Beispiel #21
0
/*
 * Create the directory "<path>/.stale" with mode def_dmode via xmkdir().
 * Returns SD_RES_SUCCESS, or SD_RES_EIO when xmkdir() fails.
 */
static int make_stale_dir(char *path)
{
	char stale[PATH_MAX];

	snprintf(stale, sizeof(stale), "%s/.stale", path);

	if (xmkdir(stale, def_dmode) >= 0)
		return SD_RES_SUCCESS;

	sd_eprintf("%s failed, %m", stale);
	return SD_RES_EIO;
}
Beispiel #22
0
/*
 * Flush the filesystem that contains obj_path to disk with syncfs().
 *
 * Returns SD_RES_SUCCESS on success, SD_RES_NO_OBJ when obj_path cannot be
 * opened and SD_RES_EIO when syncfs() fails.
 */
int default_flush(void)
{
	int store_fd, rc;

	store_fd = open(obj_path, O_RDONLY);
	if (store_fd < 0) {
		sd_eprintf("error at open() %s, %s\n",
			obj_path, strerror(errno));
		return SD_RES_NO_OBJ;
	}

	rc = SD_RES_SUCCESS;
	if (syncfs(store_fd) != 0) {
		sd_eprintf("error at syncfs(), %s\n", strerror(errno));
		rc = SD_RES_EIO;
	}

	close(store_fd);

	return rc;
}
Beispiel #23
0
/*
 * Reset the object store to an empty state: remove obj_path recursively,
 * create it again with mode def_dmode and, when the object cache is enabled,
 * format the cache as well.
 *
 * Returns SD_RES_SUCCESS, or SD_RES_EIO when the old directory cannot be
 * removed (a missing directory is fine) or the new one cannot be created.
 */
int default_format(void)
{
	/* signed: rmdir_r() is compared against negative errno values */
	int ret;

	sd_dprintf("try get a clean store\n");
	ret = rmdir_r(obj_path);
	if (ret && ret != -ENOENT) {
		sd_eprintf("failed to remove %s: %s\n",
			obj_path, strerror(-ret));
		return SD_RES_EIO;
	}
	if (mkdir(obj_path, def_dmode) < 0) {
		sd_eprintf("%m\n");
		return SD_RES_EIO;
	}
	if (is_object_cache_enabled())
		object_cache_format();

	return SD_RES_SUCCESS;
}
Beispiel #24
0
/*
 * Write iocb->length bytes from iocb->buf to object 'oid' at iocb->offset.
 *
 * The request is rejected with SD_RES_OLD_NODE_VER when iocb->epoch is older
 * than sys_epoch().  While journaling is enabled the data is journaled
 * first; if that fails, journaling is switched off, sync() is called and
 * this write is done with O_DSYNC.
 *
 * Returns SD_RES_SUCCESS, or the err_to_sderr() translation of errno when
 * the object cannot be opened or is not written completely.
 */
int default_write(uint64_t oid, const struct siocb *iocb)
{
	int open_flags = get_open_flags(oid, false, iocb->flags);
	int fd, result = SD_RES_SUCCESS;
	char path[PATH_MAX];
	ssize_t written;

	if (iocb->epoch < sys_epoch()) {
		sd_dprintf("%"PRIu32" sys %"PRIu32"\n",
			iocb->epoch, sys_epoch());
		return SD_RES_OLD_NODE_VER;
	}

	get_obj_path(oid, path);

	if (uatomic_is_true(&sys->use_journal)) {
		int jret = journal_file_write(oid, iocb->buf, iocb->length,
					      iocb->offset, false);

		if (jret != SD_RES_SUCCESS) {
			/* fall back to synchronous writes from now on */
			sd_eprintf("turn off journaling\n");
			uatomic_set_false(&sys->use_journal);
			open_flags |= O_DSYNC;
			sync();
		}
	}

	fd = open(path, open_flags, def_fmode);
	if (fd < 0)
		return err_to_sderr(oid, errno);

	written = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
	if (written != iocb->length) {
		sd_eprintf("failed to write object %"PRIx64", path=%s, offset=%"
			PRId64", size=%"PRId32", result=%zd, %m\n", oid, path,
			iocb->offset, iocb->length, written);
		result = err_to_sderr(oid, errno);
	}

	close(fd);
	return result;
}
Beispiel #25
0
/*
 * Create a non-blocking, listening AF_UNIX stream socket bound to
 * 'unix_path' and pass it to callback(fd, data).
 *
 * Returns 0 when every step succeeds and 'callback' returns 0; the socket is
 * then left open for the callback's side to manage.  Returns -1 otherwise,
 * after closing the socket.
 */
int create_unix_domain_socket(const char *unix_path,
			      int (*callback)(int, void *), void *data)
{
	int fd, ret;
	struct sockaddr_un addr;

	/* do not hand uninitialized stack bytes to bind() */
	memset(&addr, 0, sizeof(addr));
	addr.sun_family = AF_UNIX;
	pstrcpy(addr.sun_path, sizeof(addr.sun_path), unix_path);

	fd = socket(addr.sun_family, SOCK_STREAM, 0);
	if (fd < 0) {
		sd_eprintf("failed to create socket, %m");
		return -1;
	}

	/* bind() takes the generic socket address type */
	ret = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	if (ret) {
		sd_eprintf("failed to bind socket: %m");
		goto err;
	}

	ret = listen(fd, SOMAXCONN);
	if (ret) {
		sd_eprintf("failed to listen on socket: %m");
		goto err;
	}

	ret = set_nonblocking(fd);
	if (ret < 0)
		goto err;

	ret = callback(fd, data);
	if (ret)
		goto err;

	return 0;
err:
	close(fd);

	return -1;
}
Beispiel #26
0
/*
 * Return the size in bytes of the file at 'path' as reported by stat().
 *
 * When stat() fails, an error is logged and -1 is returned; because the
 * return type is size_t, callers see that as (size_t)-1.
 */
static size_t get_file_size(const char *path)
{
	struct stat st;

	if (stat(path, &st) < 0) {
		sd_eprintf("failed to stat %s, %m", path);
		return -1;
	}

	return st.st_size;
}
Beispiel #27
0
/*
 * Check whether 'addr' is a well-formed textual IP address.  A string that
 * contains ':' is parsed as IPv6, anything else as IPv4.
 *
 * Returns true only when inet_pton() accepts the string.  inet_pton()
 * returns 1 on success, 0 for a malformed address and -1 for an unsupported
 * address family, so both 0 and -1 count as invalid.
 */
bool inetaddr_is_valid(char *addr)
{
	unsigned char buf[INET6_ADDRSTRLEN];
	int af;

	af = strchr(addr, ':') ? AF_INET6 : AF_INET;
	if (inet_pton(af, addr, buf) != 1) {
		sd_eprintf("Bad address '%s'", addr);
		return false;
	}
	return true;
}
Beispiel #28
0
/*
 * Translate the errno value 'err' of a failed operation on object 'oid'
 * into a sheepdog result code.
 *
 * ENOENT becomes SD_RES_NO_OBJ, unless stat() on get_object_path(oid) fails
 * as well, in which case SD_RES_EIO is returned.  ENOSPC becomes
 * SD_RES_NO_SPACE and every other value SD_RES_EIO.
 */
int err_to_sderr(uint64_t oid, int err)
{
	struct stat s;

	if (err == ENOENT) {
		/* NOTE(review): presumably the path that should hold the
		 * object; if even that is gone the store is damaged */
		if (stat(get_object_path(oid), &s) < 0) {
			sd_eprintf("corrupted");
			return SD_RES_EIO;
		}
		sd_dprintf("object %016" PRIx64 " not found locally", oid);
		return SD_RES_NO_OBJ;
	}

	if (err == ENOSPC) {
		/* TODO: stop automatic recovery */
		sd_eprintf("diskfull, oid=%"PRIx64, oid);
		return SD_RES_NO_SPACE;
	}

	sd_eprintf("oid=%"PRIx64", %m", oid);
	return SD_RES_EIO;
}
Beispiel #29
0
/*
 * Create a uniquely named journal file whose name starts with 'jrnl_dir',
 * opened with O_DSYNC.  The generated name is stored in jd->path and the
 * descriptor in jd->fd.
 *
 * Returns SD_RES_SUCCESS, or SD_RES_UNKNOWN when mkostemp() fails.
 */
static int jrnl_create(struct jrnl_descriptor *jd, const char *jrnl_dir)
{
	/* mkostemp() replaces the trailing XXXXXX with a unique suffix */
	snprintf(jd->path, sizeof(jd->path), "%sXXXXXX", jrnl_dir);

	jd->fd = mkostemp(jd->path, O_DSYNC);
	if (jd->fd >= 0)
		return SD_RES_SUCCESS;

	sd_eprintf("failed to create %s: %m", jd->path);
	return SD_RES_UNKNOWN;
}
Beispiel #30
0
/*
 * Store the in-memory 'config' at config_path by means of
 * atomic_create_and_write().
 * Returns SD_RES_SUCCESS, or SD_RES_EIO when that call reports a failure.
 */
static int write_config(void)
{
	int rc = atomic_create_and_write(config_path, (char *)&config,
					 sizeof(config));

	if (rc >= 0)
		return SD_RES_SUCCESS;

	sd_eprintf("atomic_create_and_write() failed");
	return SD_RES_EIO;
}