Example #1
static inline void prepare_schedule_oid(uint64_t oid)
{
	struct recovery_work *rw = recovering_work;
	int i;

	for (i = 0; i < rw->nr_prio_oids; i++)
		if (rw->prio_oids[i] == oid)
			return;
	/*
	 * We need this check because the oid might not be recovered.
	 * It is very unlikely, but it can indeed happen.
	 */
	for (i = 0; i < rw->done; i++)
		if (rw->oids[i] == oid) {
			sd_dprintf("%"PRIx64" not recovered, don't schedule it",
				   oid);
			return;
		}
	/* When auto recovery is enabled, rw->oids[rw->done] is the oid
	 * currently being recovered, so there is no need to schedule it */
	if (!sys->disable_recovery && rw->oids[rw->done] == oid)
		return;
	rw->nr_prio_oids++;
	rw->prio_oids = xrealloc(rw->prio_oids,
				 rw->nr_prio_oids * sizeof(uint64_t));
	rw->prio_oids[rw->nr_prio_oids - 1] = oid;
	resume_suspended_recovery();

	sd_dprintf("%"PRIx64" nr_prio_oids %d", oid, rw->nr_prio_oids);
}
Example #2
/*
 * Keepalive probing starts after the connection has been idle for 5s.
 *
 * Heart-beat probes are then sent at 1s intervals.  If the node at the
 * other end of fd has failed, we detect it after 3 unanswered probes.
 */
int set_keepalive(int fd)
{
	int val = 1;

	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val)) < 0) {
		sd_dprintf("%m");
		return -1;
	}
	val = 5;
	if (setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &val, sizeof(val)) < 0) {
		sd_dprintf("%m");
		return -1;
	}
	val = 1;
	if (setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &val, sizeof(val)) < 0) {
		sd_dprintf("%m");
		return -1;
	}
	val = 3;
	if (setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &val, sizeof(val)) < 0) {
		sd_dprintf("%m");
		return -1;
	}
	return 0;
}
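A minimal caller sketch, assuming the usual accept() flow; accept_peer() and listen_fd are hypothetical names, not part of the snippet above. It only shows where set_keepalive() would typically be called so that a dead peer is noticed by the probes configured above.

static int accept_peer(int listen_fd)
{
	int fd = accept(listen_fd, NULL, NULL);

	if (fd < 0) {
		sd_dprintf("%m");
		return -1;
	}
	/* enable keepalive probing before the fd is handed to workers */
	if (set_keepalive(fd) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}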
Example #3
int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes)
{
	int fd, ret, len;
	time_t t;
	char path[PATH_MAX];

	sd_dprintf("update epoch: %d, %zd", epoch, nr_nodes);

	snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch);
	fd = open(path, O_RDWR | O_CREAT | O_DSYNC, def_fmode);
	if (fd < 0) {
		ret = fd;
		goto err_open;
	}

	len = nr_nodes * sizeof(struct sd_node);
	ret = xwrite(fd, (char *)nodes, len);
	if (ret != len)
		goto err;

	/* Piggyback the epoch creation time for 'collie cluster info' */
	time(&t);
	len = sizeof(t);
	ret = xwrite(fd, (char *)&t, len);
	if (ret != len)
		goto err;

	close(fd);
	return 0;
err:
	close(fd);
err_open:
	sd_dprintf("%m");
	return -1;
}
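A hypothetical reader sketch (not the project's own epoch_log_read) that relies only on the layout written above: an array of sd_node entries followed by a single time_t creation timestamp. xread is assumed here to be the read-side counterpart of xwrite; the function name and signature are illustrative only.

static int read_epoch_log(uint32_t epoch, struct sd_node *nodes,
			  int max_nodes, time_t *created)
{
	char path[PATH_MAX];
	struct stat st;
	int fd, nr_nodes;
	size_t len;

	snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	if (fstat(fd, &st) < 0 || st.st_size < (off_t)sizeof(time_t))
		goto err;

	/* everything before the trailing timestamp is the node array */
	nr_nodes = (st.st_size - sizeof(time_t)) / sizeof(struct sd_node);
	if (nr_nodes > max_nodes)
		goto err;

	len = nr_nodes * sizeof(struct sd_node);
	if (xread(fd, nodes, len) != len)
		goto err;
	if (xread(fd, created, sizeof(*created)) != sizeof(*created))
		goto err;

	close(fd);
	return nr_nodes;
err:
	close(fd);
	return -1;
}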
Example #4
void wait_get_vdis_done(void)
{
	sd_dprintf("waiting for vdi list\n");

	pthread_mutex_lock(&wait_vdis_lock);
	while (!is_vdi_list_ready)
		pthread_cond_wait(&wait_vdis_cond, &wait_vdis_lock);
	pthread_mutex_unlock(&wait_vdis_lock);

	sd_dprintf("vdi list ready\n");
}
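The function above is the waiting half of a standard mutex/condition-variable handshake. A minimal sketch of the signaling half, assuming the same wait_vdis_lock, wait_vdis_cond and is_vdi_list_ready variables (the function name is hypothetical):

static void notify_vdi_list_ready(void)
{
	pthread_mutex_lock(&wait_vdis_lock);
	is_vdi_list_ready = true;
	pthread_cond_broadcast(&wait_vdis_cond);
	pthread_mutex_unlock(&wait_vdis_lock);
}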
Example #5
static void add_event(enum local_event_type type, struct local_node *lnode,
		void *buf, size_t buf_len)
{
	int idx, i;
	struct local_node *n;
	struct local_event ev = {
		.type = type,
		.sender = *lnode,
	};

	ev.buf_len = buf_len;
	if (buf)
		memcpy(ev.buf, buf, buf_len);

	ev.nr_lnodes = get_nodes(ev.lnodes);

	switch (type) {
	case EVENT_JOIN_REQUEST:
		ev.lnodes[ev.nr_lnodes] = *lnode;
		ev.nr_lnodes++;
		break;
	case EVENT_LEAVE:
		n = find_lnode(lnode, ev.nr_lnodes, ev.lnodes);
		idx = n - ev.lnodes;

		ev.nr_lnodes--;
		memmove(n, n + 1, sizeof(*n) * (ev.nr_lnodes - idx));
		break;
	case EVENT_GATEWAY:
		n = find_lnode(lnode, ev.nr_lnodes, ev.lnodes);
		n->gateway = true;
		break;
	case EVENT_NOTIFY:
	case EVENT_BLOCK:
		break;
	case EVENT_UPDATE_NODE:
		n = find_lnode(lnode, ev.nr_lnodes, ev.lnodes);
		n->node = lnode->node;
		break;
	case EVENT_JOIN_RESPONSE:
		abort();
	}

	sd_dprintf("type = %d, sender = %s", ev.type, lnode_to_str(&ev.sender));
	for (i = 0; i < ev.nr_lnodes; i++)
		sd_dprintf("%d: %s", i, lnode_to_str(ev.lnodes + i));

	shm_queue_push(&ev);

	shm_queue_notify();
}
Example #6
static int add_event(enum local_event_type type, struct local_node *lnode,
		      void *buf, size_t buf_len)
{
	struct local_node *n;
	struct local_event ev = {
		.type = type,
		.sender = *lnode,
	};

	ev.buf_len = buf_len;
	if (buf)
		memcpy(ev.buf, buf, buf_len);

	ev.nr_lnodes = get_nodes(ev.lnodes);

	switch (type) {
	case EVENT_JOIN:
		ev.lnodes[ev.nr_lnodes] = *lnode;
		ev.nr_lnodes++;
		break;
	case EVENT_LEAVE:
		xlremove(lnode, ev.lnodes, &ev.nr_lnodes, lnode_cmp);
		break;
	case EVENT_GATEWAY:
		n = xlfind(lnode, ev.lnodes, ev.nr_lnodes, lnode_cmp);
		n->gateway = true;
		break;
	case EVENT_NOTIFY:
	case EVENT_BLOCK:
		break;
	case EVENT_UPDATE_NODE:
		n = xlfind(lnode, ev.lnodes, ev.nr_lnodes, lnode_cmp);
		n->node = lnode->node;
		break;
	case EVENT_ACCEPT:
		abort();
	}

	sd_dprintf("type = %d, sender = %s", ev.type, lnode_to_str(&ev.sender));
	for (int i = 0; i < ev.nr_lnodes; i++)
		sd_dprintf("%d: %s", i, lnode_to_str(ev.lnodes + i));

	shm_queue_push(&ev);

	shm_queue_notify();

	return SD_RES_SUCCESS;
}
Example #7
bool have_enough_zones(void)
{
	int max_copies;

	if (sys->flags & SD_FLAG_NOHALT)
		return true;

	if (!current_vnode_info)
		return false;

	max_copies = get_max_copy_number();

	sd_dprintf("flags %d, nr_zones %d, min copies %d\n",
		sys->flags, current_vnode_info->nr_zones, max_copies);

	if (!current_vnode_info->nr_zones)
		return false;

	if (sys->flags & SD_FLAG_QUORUM) {
		if (current_vnode_info->nr_zones > (max_copies/2))
			return true;
	} else {
		if (current_vnode_info->nr_zones >= max_copies)
			return true;
	}
	return false;
}
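For illustration with max_copies = 3 (a hypothetical value): with SD_FLAG_QUORUM set, the cluster keeps serving requests as long as more than 3/2, i.e. at least 2, zones are alive; without it, all 3 zones must be present.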
Example #8
void recalculate_vnodes(struct sd_node *nodes, int nr_nodes)
{
	int i, nr_non_gateway_nodes = 0;
	uint64_t avg_size = 0;
	float factor;

	for (i = 0; i < nr_nodes; i++) {
		if (nodes[i].space) {
			avg_size += nodes[i].space;
			nr_non_gateway_nodes++;
		}
	}

	if (!nr_non_gateway_nodes)
		return;

	avg_size /= nr_non_gateway_nodes;

	for (i = 0; i < nr_nodes; i++) {
		factor = (float)nodes[i].space / (float)avg_size;
		nodes[i].nr_vnodes = rintf(SD_DEFAULT_VNODES * factor);
		sd_dprintf("node %d has %d vnodes, free space %" PRIu64 "\n",
			nodes[i].nid.port, nodes[i].nr_vnodes, nodes[i].space);
	}
}
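As a worked example with hypothetical numbers: two non-gateway nodes with 100 GB and 300 GB of free space give avg_size = 200 GB, so the factors are 0.5 and 1.5 and the nodes are assigned roughly 0.5 * SD_DEFAULT_VNODES and 1.5 * SD_DEFAULT_VNODES vnodes respectively, biasing object placement toward the node with more space.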
Example #9
static void recover_object_main(struct work *work)
{
	struct recovery_work *rw = container_of(work, struct recovery_work,
						work);
	if (next_rw) {
		run_next_rw(rw);
		return;
	}

	if (rw->stop) {
		/*
		 * Stop this recovery process, wait for the epoch to be
		 * lifted, and flush the wait_obj queue so that the
		 * pending requests are requeued.
		 */
		flush_wait_obj_requests();
		sd_dprintf("recovery is stopped");
		return;
	}

	resume_wait_obj_requests(rw->oids[rw->done++]);

	if (rw->done < rw->count) {
		recover_next_object(rw);
		return;
	}

	finish_recovery(rw);
}
Example #10
static int get_trunk_sha1(uint32_t epoch, unsigned char *outsha1)
{
	int i, nr_logs = -1, ret = -1;
	struct snap_log *log_buf, *log_free = NULL;
	void *snap_buf = NULL;
	struct sha1_file_hdr hdr;

	log_free = log_buf = snap_log_read(&nr_logs);
	sd_dprintf("%d", nr_logs);
	if (nr_logs < 0)
		goto out;

	for (i = 0; i < nr_logs; i++, log_buf++) {
		if (log_buf->epoch != epoch)
			continue;
		snap_buf = snap_file_read(log_buf->sha1, &hdr);
		if (!snap_buf)
			goto out;
		memcpy(outsha1, snap_buf, SHA1_LEN);
		ret = 0;
		break;
	}
out:
	free(log_free);
	free(snap_buf);
	return ret;
}
Example #11
/*
 * Recover the object by following its track in the epoch history.  That is,
 * the routine tries to recover it from the nodes it has, at least
 * *theoretically*, stayed on according to the consistent hash ring.
 */
static int do_recover_object(struct recovery_work *rw)
{
	struct vnode_info *old;
	uint64_t oid = rw->oids[rw->done];
	uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1;
	int nr_copies, ret = -1, i;

	old = grab_vnode_info(rw->old_vinfo);

again:
	sd_dprintf("try recover object %"PRIx64" from epoch %"PRIu32, oid,
		   tgt_epoch);

	/* Let's do a breadth-first search */
	nr_copies = get_obj_copy_number(oid, old->nr_zones);
	for (i = 0; i < nr_copies; i++) {
		const struct sd_vnode *tgt_vnode;

		tgt_vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i);

		if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes,
				     rw->cur_vinfo->nr_nodes))
			continue;
		ret = recover_object_from_replica(oid, tgt_vnode,
						  epoch, tgt_epoch);
		if (ret == SD_RES_SUCCESS) {
			/* Succeed */
			break;
		} else if (SD_RES_OLD_NODE_VER == ret) {
			rw->stop = true;
			goto err;
		} else
			ret = -1;
	}

	/* No luck, roll back to an older configuration and try again */
	if (ret < 0) {
		struct vnode_info *new_old;

rollback:
		tgt_epoch--;
		if (tgt_epoch < 1) {
			sd_eprintf("can not recover oid %"PRIx64, oid);
			ret = -1;
			goto err;
		}

		new_old = get_vnode_info_epoch(tgt_epoch);
		if (!new_old)
			/* We rollback in case we don't get a valid epoch */
			goto rollback;

		put_vnode_info(old);
		old = new_old;
		goto again;
	}
err:
	put_vnode_info(old);
	return ret;
}
Example #12
static int farm_snapshot(const struct siocb *iocb)
{
	unsigned char snap_sha1[SHA1_LEN];
	unsigned char trunk_sha1[SHA1_LEN];
	struct sd_node nodes[SD_MAX_NODES];
	int nr_nodes;
	void *buffer;
	int log_nr, ret = SD_RES_EIO, epoch;

	buffer = snap_log_read(&log_nr);
	if (!buffer)
		goto out;

	epoch = log_nr + 1;
	sd_dprintf("user epoch %d", epoch);

	nr_nodes = epoch_log_read(sys->epoch, nodes, sizeof(nodes));
	if (nr_nodes < 0)
		goto out;

	if (trunk_file_write(trunk_sha1) < 0)
		goto out;

	if (snap_file_write(sys->epoch, nodes, nr_nodes,
			    trunk_sha1, snap_sha1) < 0)
		goto out;

	if (snap_log_write(epoch, snap_sha1) < 0)
		goto out;

	ret = SD_RES_SUCCESS;
out:
	free(buffer);
	return ret;
}
Example #13
int sheep_exec_req(const struct node_id *nid, struct sd_req *hdr, void *buf)
{
	struct sd_rsp *rsp = (struct sd_rsp *)hdr;
	struct sockfd *sfd;
	int ret;

	assert(is_worker_thread());

	sfd = sockfd_cache_get(nid);
	if (!sfd)
		return SD_RES_NETWORK_ERROR;

	ret = exec_req(sfd->fd, hdr, buf, sheep_need_retry, hdr->epoch,
		       MAX_RETRY_COUNT);
	if (ret) {
		sd_dprintf("remote node might have gone away");
		sockfd_cache_del(nid, sfd);
		return SD_RES_NETWORK_ERROR;
	}
	ret = rsp->result;
	if (ret != SD_RES_SUCCESS)
		sd_eprintf("failed %s", sd_strerror(ret));

	sockfd_cache_put(nid, sfd);
	return ret;
}
Example #14
/*
 * Schedule prio_oids to be recovered first, in FIFO order.
 *
 * rw->done is the index of the next object to be recovered in the original
 * list, and therefore also the number of objects already recovered.
 * We simply splice rw->prio_oids in between:
 *   new_oids = [0..rw->done - 1] + [rw->prio_oids] + [rw->done..]
 */
static inline void finish_schedule_oids(struct recovery_work *rw)
{
	int i, nr_recovered = rw->done, new_idx;
	uint64_t *new_oids;

	/* If only the last oid remains, there is nothing to reorder */
	if (nr_recovered == rw->count - 1)
		goto done;

	new_oids = xmalloc(list_buffer_size);
	memcpy(new_oids, rw->oids, nr_recovered * sizeof(uint64_t));
	memcpy(new_oids + nr_recovered, rw->prio_oids,
	       rw->nr_prio_oids * sizeof(uint64_t));
	new_idx = nr_recovered + rw->nr_prio_oids;

	for (i = rw->done; i < rw->count; i++) {
		if (oid_in_prio_oids(rw, rw->oids[i]))
			continue;
		new_oids[new_idx++] = rw->oids[i];
	}
	/* rw->count should eq new_idx, otherwise something is wrong */
	sd_dprintf("%snr_recovered %d, nr_prio_oids %d, count %d = new %d",
		   rw->count == new_idx ? "" : "WARN: ", nr_recovered,
		   rw->nr_prio_oids, rw->count, new_idx);

	free(rw->oids);
	rw->oids = new_oids;
done:
	free(rw->prio_oids);
	rw->prio_oids = NULL;
	rw->nr_scheduled_prio_oids += rw->nr_prio_oids;
	rw->nr_prio_oids = 0;
}
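For illustration with hypothetical oids: if rw->oids is {A, B, C, D, E}, rw->done is 2 and rw->prio_oids is {D}, the merged list becomes {A, B, D, C, E}: the two already-recovered oids keep their slots, D is pulled forward to be recovered next, and the remaining oids follow in their original order.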
Example #15
static inline void run_next_rw(struct recovery_work *rw)
{
	free_recovery_work(rw);
	rw = next_rw;
	next_rw = NULL;
	recovering_work = rw;
	flush_wait_obj_requests();
	queue_work(sys->recovery_wqueue, &rw->work);
	sd_dprintf("recovery work is superseded");
}
Example #16
int init_disk_space(const char *base_path)
{
	int ret = SD_RES_SUCCESS;
	uint64_t space_size = 0, mds;
	struct statvfs fs;

	if (sys->gateway_only)
		goto out;

	/* We need to init md even if we don't need to update the space */
	mds = md_init_space();

	/* If the daemon was restarted, reuse the space recorded on disk */
	ret = get_node_space(&space_size);
	if (space_size != 0) {
		sys->disk_space = space_size;
		goto out;
	}

	/* User has specified the space at startup */
	if (sys->disk_space) {
		ret = set_node_space(sys->disk_space);
		goto out;
	}

	if (mds) {
		sys->disk_space = mds;
	} else {
		ret = statvfs(base_path, &fs);
		if (ret < 0) {
			sd_dprintf("get disk space failed %m");
			ret = SD_RES_EIO;
			goto out;
		}
		sys->disk_space = (uint64_t)fs.f_frsize * fs.f_bfree;
	}

	ret = set_node_space(sys->disk_space);
out:
	sd_dprintf("disk free space is %" PRIu64, sys->disk_space);
	return ret;
}
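In other words, the precedence is: a space value already recorded for the node wins, then a size the user specified at startup, then the multi-disk (md) total, and only as a last resort the free space reported by statvfs().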
Example #17
int default_init(void)
{
	int ret;

	sd_dprintf("use plain store driver");
	ret = for_each_obj_path(make_stale_dir);
	if (ret != SD_RES_SUCCESS)
		return ret;

	return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL);
}
Example #18
static int post_cluster_new_vdi(const struct sd_req *req, struct sd_rsp *rsp,
				void *data)
{
	unsigned long nr = rsp->vdi.vdi_id;
	int ret = rsp->result;

	sd_dprintf("done %d %ld\n", ret, nr);
	if (ret == SD_RES_SUCCESS)
		set_bit(nr, sys->vdi_inuse);

	return ret;
}
Example #19
/* Prepare the object list that belongs to this node */
static void prepare_object_list(struct work *work)
{
	struct recovery_work *rw = container_of(work, struct recovery_work,
						work);
	struct sd_node *cur = rw->cur_vinfo->nodes;
	int cur_nr = rw->cur_vinfo->nr_nodes;
	int start = random() % cur_nr, i, end = cur_nr;
	uint64_t *oids;

	sd_dprintf("%u", rw->epoch);
	wait_get_vdis_done();
again:
	/* Start at a random node for better load balancing */
	for (i = start; i < end; i++) {
		size_t nr_oids;
		struct sd_node *node = cur + i;

		if (next_rw) {
			sd_dprintf("go to the next recovery");
			return;
		}
		if (newly_joined(node, rw))
			/* new node doesn't have a list file */
			continue;

		oids = fetch_object_list(node, rw->epoch, &nr_oids);
		if (!oids)
			continue;
		screen_object_list(rw, oids, nr_oids);
		free(oids);
	}

	if (start != 0) {
		end = start;
		start = 0;
		goto again;
	}

	sd_dprintf("%d", rw->count);
}
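The goto implements a wrap-around scan: nodes [start, cur_nr) are visited first, then [0, start), so every node is queried exactly once while the starting point stays randomized.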
Example #20
static void shm_queue_notify(void)
{
	int i;
	size_t nr;
	struct local_node lnodes[SD_MAX_NODES];

	nr = get_nodes(lnodes);

	for (i = 0; i < nr; i++) {
		sd_dprintf("send signal to %s", lnode_to_str(lnodes + i));
		kill(lnodes[i].pid, SIGUSR1);
	}
}
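This assumes each node process has installed a SIGUSR1 handler beforehand. A minimal registration sketch under that assumption; the handler body and both function names are hypothetical, not taken from the snippet above.

static void shm_queue_handler(int signo)
{
	/* wake up and drain the shared-memory event queue (driver-specific) */
}

static int register_shm_queue_handler(void)
{
	struct sigaction sa = {
		.sa_handler = shm_queue_handler,
	};

	sigemptyset(&sa.sa_mask);
	return sigaction(SIGUSR1, &sa, NULL);
}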
Example #21
/*
 * If the node is a gateway, this function only looks up the store driver.
 * Otherwise, it also initializes the backend store.
 */
int init_store_driver(bool is_gateway)
{
	char driver_name[STORE_LEN], *p;
	int ret;

	memset(driver_name, '\0', sizeof(driver_name));
	ret = get_cluster_store(driver_name);
	if (ret != SD_RES_SUCCESS)
		return ret;

	p = memchr(driver_name, '\0', STORE_LEN);
	if (!p) {
		/*
		 * If the driver name is not NUL terminated we are in deep
		 * trouble, let's get out here.
		 */
		sd_dprintf("store name not NUL terminated");
		return SD_RES_NO_STORE;
	}

	/*
	 * The store file might not exist in case this is a new sheep that
	 * never joined a cluster before.
	 */
	if (p == driver_name)
		return 0;

	sd_store = find_store_driver(driver_name);
	if (!sd_store) {
		sd_dprintf("store %s not found", driver_name);
		return SD_RES_NO_STORE;
	}

	if (is_gateway)
		return SD_RES_SUCCESS;

	return sd_store->init();
}
Example #22
int default_format(void)
{
	unsigned ret;

	sd_dprintf("try get a clean store");
	ret = for_each_obj_path(purge_dir);
	if (ret != SD_RES_SUCCESS)
		return ret;

	if (sys->enable_object_cache)
		object_cache_format();

	return SD_RES_SUCCESS;
}
Example #23
static int init_objlist_and_vdi_bitmap(uint64_t oid, char *wd, void *arg)
{
	int ret;
	objlist_cache_insert(oid);

	if (is_vdi_obj(oid)) {
		sd_dprintf("found the VDI object %" PRIx64, oid);
		set_bit(oid_to_vid(oid), sys->vdi_inuse);
		ret = init_vdi_copy_number(oid, wd);
		if (ret != SD_RES_SUCCESS)
			return ret;
	}
	return SD_RES_SUCCESS;
}
Example #24
static int err_to_sderr(char *path, uint64_t oid, int err)
{
	struct stat s;
	char *dir = dirname(path);

	sd_dprintf("%s", dir);
	switch (err) {
	case ENOENT:
		if (stat(dir, &s) < 0) {
			sd_eprintf("%s corrupted", dir);
			return md_handle_eio(dir);
		}
		sd_dprintf("object %016" PRIx64 " not found locally", oid);
		return SD_RES_NO_OBJ;
	case ENOSPC:
		/* TODO: stop automatic recovery */
		sd_eprintf("diskfull, oid=%"PRIx64, oid);
		return SD_RES_NO_SPACE;
	default:
		sd_eprintf("oid=%"PRIx64", %m", oid);
		return md_handle_eio(dir);
	}
}
Example #25
static inline void calculate_vdisks(struct disk *disks, int nr_disks,
			     uint64_t total)
{
	uint64_t avg_size = total / nr_disks;
	float factor;
	int i;

	for (i = 0; i < nr_disks; i++) {
		factor = (float)disks[i].space / (float)avg_size;
		md_disks[i].nr_vdisks = rintf(MD_DEFAULT_VDISKS * factor);
		sd_dprintf("%s has %d vdisks, free space %" PRIu64,
			   md_disks[i].path, md_disks[i].nr_vdisks,
			   md_disks[i].space);
	}
}
Example #26
int default_init(const char *p)
{
	sd_dprintf("use plain store driver\n");

	/* create a stale directory */
	snprintf(stale_dir, sizeof(stale_dir), "%s/.stale", p);
	if (mkdir(stale_dir, 0755) < 0) {
		if (errno != EEXIST) {
			sd_eprintf("%m\n");
			return SD_RES_EIO;
		}
	}

	return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL);
}
Example #27
/* Fetch the object list from the given node */
static uint64_t *fetch_object_list(struct sd_node *e, uint32_t epoch,
				   size_t *nr_oids)
{
	char name[128];
	struct sd_list_req hdr;
	struct sd_list_rsp *rsp = (struct sd_list_rsp *)&hdr;
	size_t buf_size = list_buffer_size;
	uint64_t *buf = xmalloc(buf_size);
	int ret;

	addr_to_str(name, sizeof(name), e->nid.addr, 0);
	sd_dprintf("%s %"PRIu32, name, e->nid.port);

retry:
	sd_init_req((struct sd_req *)&hdr, SD_OP_GET_OBJ_LIST);
	hdr.tgt_epoch = epoch - 1;
	hdr.data_length = buf_size;
	hdr.epoch = sys_epoch();
	ret = sheep_exec_req(&e->nid, (struct sd_req *)&hdr, buf);

	switch (ret) {
	case SD_RES_SUCCESS:
		break;
	case SD_RES_BUFFER_SMALL:
		buf_size *= 2;
		buf = xrealloc(buf, buf_size);
		goto retry;
	default:
		free(buf);
		return NULL;
	}

	*nr_oids = rsp->data_length / sizeof(uint64_t);
	sd_dprintf("%zu", *nr_oids);
	return buf;
}
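The SD_RES_BUFFER_SMALL branch doubles the buffer and retries, so the caller never has to know in advance how many objects the remote node holds.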
Example #28
char *get_object_path(uint64_t oid)
{
	struct vdisk *vd;
	char *p;

	if (!sys->enable_md)
		return obj_path;

	pthread_rwlock_rdlock(&md_lock);
	vd = oid_to_vdisk(oid);
	p = md_disks[vd->idx].path;
	pthread_rwlock_unlock(&md_lock);
	sd_dprintf("%d, %s", vd->idx, p);

	return p;
}
Example #29
static inline void finish_recovery(struct recovery_work *rw)
{
	recovering_work = NULL;
	sys->recovered_epoch = rw->epoch;

	if (sd_store->end_recover)
		sd_store->end_recover(sys->epoch - 1, rw->old_vinfo);

	/* notify other nodes of recovery completion */
	rw->work.fn = notify_recovery_completion_work;
	rw->work.done = notify_recovery_completion_main;
	queue_work(sys->recovery_wqueue, &rw->work);

	sd_dprintf("recovery complete: new epoch %"PRIu32,
		   sys->recovered_epoch);
}
Example #30
static int recover_object_from_replica(uint64_t oid,
				       const struct sd_vnode *vnode,
				       uint32_t epoch, uint32_t tgt_epoch)
{
	struct sd_req hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	unsigned rlen;
	int ret = SD_RES_NO_MEM;
	void *buf = NULL;
	struct siocb iocb = { 0 };

	if (vnode_is_local(vnode)) {
		ret = sd_store->link(oid, tgt_epoch);
		goto out;
	}

	rlen = get_objsize(oid);
	buf = valloc(rlen);
	if (!buf) {
		sd_eprintf("%m");
		goto out;
	}

	sd_init_req(&hdr, SD_OP_READ_PEER);
	hdr.epoch = epoch;
	hdr.flags = SD_FLAG_CMD_RECOVERY;
	hdr.data_length = rlen;
	hdr.obj.oid = oid;
	hdr.obj.tgt_epoch = tgt_epoch;

	ret = sheep_exec_req(&vnode->nid, &hdr, buf);
	if (ret != SD_RES_SUCCESS)
		goto out;
	iocb.epoch = epoch;
	iocb.length = rsp->data_length;
	iocb.offset = rsp->obj.offset;
	iocb.buf = buf;
	ret = sd_store->create_and_write(oid, &iocb);
out:
	if (ret == SD_RES_SUCCESS) {
		sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid,
			tgt_epoch, epoch);
		objlist_cache_insert(oid);
	}
	free(buf);
	return ret;
}