/*
 * Queue @oid for prioritized recovery unless it is already queued,
 * already recovered, or currently being recovered.
 */
static inline void prepare_schedule_oid(uint64_t oid)
{
	struct recovery_work *rw = recovering_work;
	int i;

	/* Already scheduled for priority recovery? */
	for (i = 0; i < rw->nr_prio_oids; i++)
		if (rw->prio_oids[i] == oid)
			return;
	/*
	 * We need this check because oid might not be recovered.
	 * Very much unlikely though, but it might happen indeed.
	 */
	for (i = 0; i < rw->done; i++)
		if (rw->oids[i] == oid) {
			sd_dprintf("%"PRIx64" not recovered, don't schedule it",
				   oid);
			return;
		}
	/* When auto recovery is enabled, the oid is currently being
	 * recovered */
	if (!sys->disable_recovery && rw->oids[rw->done] == oid)
		return;

	/* Append oid to the priority list; xrealloc aborts on OOM. */
	rw->nr_prio_oids++;
	rw->prio_oids = xrealloc(rw->prio_oids,
				 rw->nr_prio_oids * sizeof(uint64_t));
	rw->prio_oids[rw->nr_prio_oids - 1] = oid;
	resume_suspended_recovery();

	sd_dprintf("%"PRIx64" nr_prio_oids %d", oid, rw->nr_prio_oids);
}
/*
 * Timeout after request is issued after 5s.
 *
 * Heart-beat message will be sent periodically with 1s interval.
 * If the node of the other end of fd fails, we'll detect it in 3s
 */
int set_keepalive(int fd)
{
	/* Keepalive options to apply, in order. */
	static const struct {
		int level;
		int optname;
		int value;
	} opts[] = {
		{ SOL_SOCKET, SO_KEEPALIVE, 1 },
		{ SOL_TCP, TCP_KEEPIDLE, 5 },
		{ SOL_TCP, TCP_KEEPINTVL, 1 },
		{ SOL_TCP, TCP_KEEPCNT, 3 },
	};
	size_t i;

	for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
		int val = opts[i].value;

		if (setsockopt(fd, opts[i].level, opts[i].optname,
			       &val, sizeof(val)) < 0) {
			sd_dprintf("%m");
			return -1;
		}
	}
	return 0;
}
/*
 * Persist the node list for @epoch to the epoch log file, followed by
 * the creation time.
 *
 * Returns 0 on success, -1 on failure.
 */
int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes)
{
	int fd, ret, len;
	time_t t;
	char path[PATH_MAX];

	sd_dprintf("update epoch: %d, %zd", epoch, nr_nodes);

	snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch);
	/* O_DSYNC: the epoch log must hit stable storage before we return */
	fd = open(path, O_RDWR | O_CREAT | O_DSYNC, def_fmode);
	if (fd < 0) {
		ret = fd;
		goto err_open;
	}

	len = nr_nodes * sizeof(struct sd_node);
	ret = xwrite(fd, (char *)nodes, len);
	if (ret != len)
		goto err;

	/* Piggyback the epoch creation time for 'collie cluster info' */
	time(&t);
	len = sizeof(t);
	ret = xwrite(fd, (char *)&t, len);
	if (ret != len)
		goto err;

	close(fd);
	return 0;
err:
	close(fd);
err_open:
	sd_dprintf("%m");
	return -1;
}
/*
 * Block the calling thread until the vdi list has been fetched
 * (is_vdi_list_ready is set under wait_vdis_lock).
 *
 * Fix: dropped the trailing "\n" from the sd_dprintf messages — no
 * other sd_dprintf call in this file embeds a newline.
 */
void wait_get_vdis_done(void)
{
	sd_dprintf("waiting for vdi list");

	pthread_mutex_lock(&wait_vdis_lock);
	while (!is_vdi_list_ready)
		pthread_cond_wait(&wait_vdis_cond, &wait_vdis_lock);
	pthread_mutex_unlock(&wait_vdis_lock);

	sd_dprintf("vdi list ready");
}
/*
 * Build a local_event of @type from @lnode plus an optional payload,
 * snapshot the current node list into it (adjusted for the event),
 * then push it onto the shared-memory queue and signal the other
 * sheep processes.
 */
static void add_event(enum local_event_type type, struct local_node *lnode,
		      void *buf, size_t buf_len)
{
	int idx, i;
	struct local_node *n;
	struct local_event ev = {
		.type = type,
		.sender = *lnode,
	};

	ev.buf_len = buf_len;
	if (buf)
		memcpy(ev.buf, buf, buf_len);

	ev.nr_lnodes = get_nodes(ev.lnodes);

	switch (type) {
	case EVENT_JOIN_REQUEST:
		/* The joining node is not in the list yet; append it. */
		ev.lnodes[ev.nr_lnodes] = *lnode;
		ev.nr_lnodes++;
		break;
	case EVENT_LEAVE:
		/* Remove the leaving node, keeping the array contiguous. */
		n = find_lnode(lnode, ev.nr_lnodes, ev.lnodes);
		idx = n - ev.lnodes;
		ev.nr_lnodes--;

		memmove(n, n + 1, sizeof(*n) * (ev.nr_lnodes - idx));
		break;
	case EVENT_GATEWAY:
		n = find_lnode(lnode, ev.nr_lnodes, ev.lnodes);
		n->gateway = true;
		break;
	case EVENT_NOTIFY:
	case EVENT_BLOCK:
		break;
	case EVENT_UPDATE_NODE:
		n = find_lnode(lnode, ev.nr_lnodes, ev.lnodes);
		n->node = lnode->node;
		break;
	case EVENT_JOIN_RESPONSE:
		/* Join responses are never generated through this path. */
		abort();
	}

	sd_dprintf("type = %d, sender = %s", ev.type, lnode_to_str(&ev.sender));
	for (i = 0; i < ev.nr_lnodes; i++)
		sd_dprintf("%d: %s", i, lnode_to_str(ev.lnodes + i));

	shm_queue_push(&ev);

	shm_queue_notify();
}
/*
 * Build a local_event of @type from @lnode plus an optional payload,
 * snapshot the current node list into it (adjusted for the event),
 * then push it onto the shared-memory queue and wake the other sheep
 * processes.
 *
 * Always returns SD_RES_SUCCESS.
 */
static int add_event(enum local_event_type type, struct local_node *lnode,
		     void *buf, size_t buf_len)
{
	struct local_node *n;
	struct local_event ev = {
		.type = type,
		.sender = *lnode,
	};

	ev.buf_len = buf_len;
	if (buf)
		memcpy(ev.buf, buf, buf_len);

	ev.nr_lnodes = get_nodes(ev.lnodes);

	switch (type) {
	case EVENT_JOIN:
		/* The joining node is not in the list yet; append it. */
		ev.lnodes[ev.nr_lnodes] = *lnode;
		ev.nr_lnodes++;
		break;
	case EVENT_LEAVE:
		/* Drop the leaving node from the snapshot. */
		xlremove(lnode, ev.lnodes, &ev.nr_lnodes, lnode_cmp);
		break;
	case EVENT_GATEWAY:
		n = xlfind(lnode, ev.lnodes, ev.nr_lnodes, lnode_cmp);
		n->gateway = true;
		break;
	case EVENT_NOTIFY:
	case EVENT_BLOCK:
		break;
	case EVENT_UPDATE_NODE:
		n = xlfind(lnode, ev.lnodes, ev.nr_lnodes, lnode_cmp);
		n->node = lnode->node;
		break;
	case EVENT_ACCEPT:
		/* Accept events are never generated through this path. */
		abort();
	}

	sd_dprintf("type = %d, sender = %s", ev.type, lnode_to_str(&ev.sender));
	for (int i = 0; i < ev.nr_lnodes; i++)
		sd_dprintf("%d: %s", i, lnode_to_str(ev.lnodes + i));

	shm_queue_push(&ev);

	shm_queue_notify();

	return SD_RES_SUCCESS;
}
/*
 * Decide whether the cluster currently has enough failure zones to
 * serve I/O safely.
 */
bool have_enough_zones(void)
{
	int max_copies;

	/* With NOHALT the cluster serves I/O regardless of zone count. */
	if (sys->flags & SD_FLAG_NOHALT)
		return true;

	if (!current_vnode_info)
		return false;

	max_copies = get_max_copy_number();

	sd_dprintf("flags %d, nr_zones %d, min copies %d\n",
		   sys->flags, current_vnode_info->nr_zones, max_copies);

	if (!current_vnode_info->nr_zones)
		return false;

	/* Quorum mode needs only a majority; otherwise all copies. */
	if (sys->flags & SD_FLAG_QUORUM)
		return current_vnode_info->nr_zones > max_copies / 2;

	return current_vnode_info->nr_zones >= max_copies;
}
void recalculate_vnodes(struct sd_node *nodes, int nr_nodes) { int i, nr_non_gateway_nodes = 0; uint64_t avg_size = 0; float factor; for (i = 0; i < nr_nodes; i++) { if (nodes[i].space) { avg_size += nodes[i].space; nr_non_gateway_nodes++; } } if (!nr_non_gateway_nodes) return; avg_size /= nr_non_gateway_nodes; for (i = 0; i < nr_nodes; i++) { factor = (float)nodes[i].space / (float)avg_size; nodes[i].nr_vnodes = rintf(SD_DEFAULT_VNODES * factor); sd_dprintf("node %d has %d vnodes, free space %" PRIu64 "\n", nodes[i].nid.port, nodes[i].nr_vnodes, nodes[i].space); } }
/*
 * Completion handler for one recovered object: resume requests
 * waiting on it and either kick off the next object or finish the
 * whole recovery.
 */
static void recover_object_main(struct work *work)
{
	struct recovery_work *rw = container_of(work, struct recovery_work,
						work);
	/* A newer recovery was queued; switch over to it. */
	if (next_rw) {
		run_next_rw(rw);
		return;
	}

	if (rw->stop) {
		/*
		 * Stop this recovery process and wait for epoch to be
		 * lifted and flush wait_obj queue to requeue those
		 * requests
		 */
		flush_wait_obj_requests();
		sd_dprintf("recovery is stopped");
		return;
	}

	/* Wake requests that were blocked on the just-recovered oid. */
	resume_wait_obj_requests(rw->oids[rw->done++]);

	if (rw->done < rw->count) {
		recover_next_object(rw);
		return;
	}

	finish_recovery(rw);
}
/*
 * Look up the snapshot log entry for @epoch and copy the trunk sha1
 * recorded in its snap file into @outsha1.
 *
 * Returns 0 on success, -1 when the epoch is not found or reading
 * fails.
 */
static int get_trunk_sha1(uint32_t epoch, unsigned char *outsha1)
{
	int i, nr_logs = -1, ret = -1;
	struct snap_log *log_buf, *log_free = NULL;
	void *snap_buf = NULL;
	struct sha1_file_hdr hdr;

	log_free = log_buf = snap_log_read(&nr_logs);
	sd_dprintf("%d", nr_logs);
	if (nr_logs < 0)
		goto out;

	for (i = 0; i < nr_logs; i++, log_buf++) {
		if (log_buf->epoch != epoch)
			continue;
		snap_buf = snap_file_read(log_buf->sha1, &hdr);
		if (!snap_buf)
			goto out;
		/* The snap file content starts with the trunk sha1. */
		memcpy(outsha1, snap_buf, SHA1_LEN);
		ret = 0;
		break;
	}
out:
	free(log_free);
	free(snap_buf);
	return ret;
}
/* * Recover the object from its track in epoch history. That is, * the routine will try to recovery it from the nodes it has stayed, * at least, *theoretically* on consistent hash ring. */ static int do_recover_object(struct recovery_work *rw) { struct vnode_info *old; uint64_t oid = rw->oids[rw->done]; uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1; int nr_copies, ret, i; old = grab_vnode_info(rw->old_vinfo); again: sd_dprintf("try recover object %"PRIx64" from epoch %"PRIu32, oid, tgt_epoch); /* Let's do a breadth-first search */ nr_copies = get_obj_copy_number(oid, old->nr_zones); for (i = 0; i < nr_copies; i++) { const struct sd_vnode *tgt_vnode; tgt_vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i); if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes, rw->cur_vinfo->nr_nodes)) continue; ret = recover_object_from_replica(oid, tgt_vnode, epoch, tgt_epoch); if (ret == SD_RES_SUCCESS) { /* Succeed */ break; } else if (SD_RES_OLD_NODE_VER == ret) { rw->stop = true; goto err; } else ret = -1; } /* No luck, roll back to an older configuration and try again */ if (ret < 0) { struct vnode_info *new_old; rollback: tgt_epoch--; if (tgt_epoch < 1) { sd_eprintf("can not recover oid %"PRIx64, oid); ret = -1; goto err; } new_old = get_vnode_info_epoch(tgt_epoch); if (!new_old) /* We rollback in case we don't get a valid epoch */ goto rollback; put_vnode_info(old); old = new_old; goto again; } err: put_vnode_info(old); return ret; }
/*
 * Take a farm snapshot: write the current trunk object and the node
 * list of the current epoch into a new snap file, then append it to
 * the snap log.
 */
static int farm_snapshot(const struct siocb *iocb)
{
	unsigned char snap_sha1[SHA1_LEN];
	unsigned char trunk_sha1[SHA1_LEN];
	struct sd_node nodes[SD_MAX_NODES];
	int nr_nodes;
	void *buffer;
	int log_nr, ret = SD_RES_EIO, epoch;

	/* Read the log only to learn how many snapshots already exist. */
	buffer = snap_log_read(&log_nr);
	if (!buffer)
		goto out;

	/* The user-visible snapshot number is one past the last entry. */
	epoch = log_nr + 1;
	sd_dprintf("user epoch %d", epoch);

	nr_nodes = epoch_log_read(sys->epoch, nodes, sizeof(nodes));
	if (nr_nodes < 0)
		goto out;

	if (trunk_file_write(trunk_sha1) < 0)
		goto out;

	if (snap_file_write(sys->epoch, nodes, nr_nodes,
			    trunk_sha1, snap_sha1) < 0)
		goto out;

	if (snap_log_write(epoch, snap_sha1) < 0)
		goto out;

	ret = SD_RES_SUCCESS;
out:
	free(buffer);
	return ret;
}
int sheep_exec_req(const struct node_id *nid, struct sd_req *hdr, void *buf) { struct sd_rsp *rsp = (struct sd_rsp *)hdr; struct sockfd *sfd; int ret; assert(is_worker_thread()); sfd = sockfd_cache_get(nid); if (!sfd) return SD_RES_NETWORK_ERROR; ret = exec_req(sfd->fd, hdr, buf, sheep_need_retry, hdr->epoch, MAX_RETRY_COUNT); if (ret) { sd_dprintf("remote node might have gone away"); sockfd_cache_del(nid, sfd); return SD_RES_NETWORK_ERROR; } ret = rsp->result; if (ret != SD_RES_SUCCESS) sd_eprintf("failed %s", sd_strerror(ret)); sockfd_cache_put(nid, sfd); return ret; }
/*
 * Schedule prio_oids to be recovered first in FIFO order
 *
 * rw->done is index of the original next object to be recovered and also the
 * number of objects already recovered.
 * we just move rw->prio_oids in between:
 * new_oids = [0..rw->done - 1] + [rw->prio_oids] + [rw->done]
 */
static inline void finish_schedule_oids(struct recovery_work *rw)
{
	int i, nr_recovered = rw->done, new_idx;
	uint64_t *new_oids;

	/* If I am the last oid, done */
	if (nr_recovered == rw->count - 1)
		goto done;

	new_oids = xmalloc(list_buffer_size);
	/* Copy the already-recovered prefix, then the priority oids. */
	memcpy(new_oids, rw->oids, nr_recovered * sizeof(uint64_t));
	memcpy(new_oids + nr_recovered, rw->prio_oids,
	       rw->nr_prio_oids * sizeof(uint64_t));
	new_idx = nr_recovered + rw->nr_prio_oids;

	/* Append the remainder, skipping oids already in the prio part. */
	for (i = rw->done; i < rw->count; i++) {
		if (oid_in_prio_oids(rw, rw->oids[i]))
			continue;
		new_oids[new_idx++] = rw->oids[i];
	}
	/* rw->count should eq new_idx, otherwise something is wrong */
	sd_dprintf("%snr_recovered %d, nr_prio_oids %d, count %d = new %d",
		   rw->count == new_idx ? "" : "WARN: ", nr_recovered,
		   rw->nr_prio_oids, rw->count, new_idx);

	free(rw->oids);
	rw->oids = new_oids;
done:
	free(rw->prio_oids);
	rw->prio_oids = NULL;
	rw->nr_scheduled_prio_oids += rw->nr_prio_oids;
	rw->nr_prio_oids = 0;
}
/*
 * Abandon the current recovery work @rw and start the queued next_rw
 * in its place.
 */
static inline void run_next_rw(struct recovery_work *rw)
{
	free_recovery_work(rw);
	rw = next_rw;
	next_rw = NULL;
	recovering_work = rw;
	/*
	 * Requeue the waiting object requests so they are serviced
	 * against the new recovery work.
	 */
	flush_wait_obj_requests();
	queue_work(sys->recovery_wqueue, &rw->work);
	sd_dprintf("recovery work is superseded");
}
/*
 * Determine and persist the disk space available to this node.
 *
 * Precedence: previously stored node space > user-specified space at
 * startup > multi-disk total > statvfs of @base_path.
 */
int init_disk_space(const char *base_path)
{
	int ret = SD_RES_SUCCESS;
	uint64_t space_size = 0, mds;
	struct statvfs fs;

	if (sys->gateway_only)
		goto out;

	/* We need to init md even if we don't need to update space */
	mds = md_init_space();

	/* If it is restarted */
	ret = get_node_space(&space_size);
	if (space_size != 0) {
		sys->disk_space = space_size;
		goto out;
	}

	/* User has specified the space at startup */
	if (sys->disk_space) {
		ret = set_node_space(sys->disk_space);
		goto out;
	}

	if (mds) {
		sys->disk_space = mds;
	} else {
		/* Fall back to the free space of the filesystem. */
		ret = statvfs(base_path, &fs);
		if (ret < 0) {
			sd_dprintf("get disk space failed %m");
			ret = SD_RES_EIO;
			goto out;
		}
		sys->disk_space = (uint64_t)fs.f_frsize * fs.f_bfree;
	}

	ret = set_node_space(sys->disk_space);
out:
	sd_dprintf("disk free space is %" PRIu64, sys->disk_space);
	return ret;
}
/*
 * Initialize the plain store driver: create the stale directories,
 * then scan the working directory to build the object list and vdi
 * bitmap.
 */
int default_init(void)
{
	int rc;

	sd_dprintf("use plain store driver");

	rc = for_each_obj_path(make_stale_dir);
	if (rc == SD_RES_SUCCESS)
		rc = for_each_object_in_wd(init_objlist_and_vdi_bitmap,
					   true, NULL);
	return rc;
}
/*
 * Post-handler for cluster VDI creation: mark the new vdi id as in
 * use when the operation succeeded, and pass the result through.
 */
static int post_cluster_new_vdi(const struct sd_req *req, struct sd_rsp *rsp,
				void *data)
{
	unsigned long vid = rsp->vdi.vdi_id;
	int result = rsp->result;

	sd_dprintf("done %d %ld\n", result, vid);
	if (result == SD_RES_SUCCESS)
		set_bit(vid, sys->vdi_inuse);

	return result;
}
/* Prepare the object list that belongs to this node */
static void prepare_object_list(struct work *work)
{
	struct recovery_work *rw = container_of(work, struct recovery_work,
						work);
	struct sd_node *cur = rw->cur_vinfo->nodes;
	int cur_nr = rw->cur_vinfo->nr_nodes;
	int start = random() % cur_nr, i, end = cur_nr;
	uint64_t *oids;

	sd_dprintf("%u", rw->epoch);
	wait_get_vdis_done();
again:
	/* We need to start at random node for better load balance */
	for (i = start; i < end; i++) {
		size_t nr_oids;
		struct sd_node *node = cur + i;

		/* A newer recovery superseded this one; bail out early. */
		if (next_rw) {
			sd_dprintf("go to the next recovery");
			return;
		}
		if (newly_joined(node, rw))
			/* new node doesn't have a list file */
			continue;

		oids = fetch_object_list(node, rw->epoch, &nr_oids);
		if (!oids)
			continue;
		/* presumably filters the fetched oids into rw — confirm */
		screen_object_list(rw, oids, nr_oids);
		free(oids);
	}

	/* Second pass covers the nodes before the random start point. */
	if (start != 0) {
		end = start;
		start = 0;
		goto again;
	}

	sd_dprintf("%d", rw->count);
}
/* Signal every registered sheep process that a new event is queued. */
static void shm_queue_notify(void)
{
	struct local_node lnodes[SD_MAX_NODES];
	size_t nr = get_nodes(lnodes);
	size_t i;

	for (i = 0; i < nr; i++) {
		sd_dprintf("send signal to %s", lnode_to_str(lnodes + i));
		kill(lnodes[i].pid, SIGUSR1);
	}
}
/*
 * If the node is gateway, this function only finds the store driver.
 * Otherwise, this function initializes the backend store
 */
int init_store_driver(bool is_gateway)
{
	char driver_name[STORE_LEN], *p;
	int ret;

	memset(driver_name, '\0', sizeof(driver_name));
	ret = get_cluster_store(driver_name);
	if (ret != SD_RES_SUCCESS)
		return ret;

	p = memchr(driver_name, '\0', STORE_LEN);
	if (p == NULL) {
		/*
		 * If the driver name is not NUL terminated we are in deep
		 * trouble, let's get out here.
		 */
		sd_dprintf("store name not NUL terminated");
		return SD_RES_NO_STORE;
	}

	/*
	 * The store file might not exist in case this is a new sheep that
	 * never joined a cluster before.
	 */
	if (p == driver_name)
		return 0;

	sd_store = find_store_driver(driver_name);
	if (sd_store == NULL) {
		sd_dprintf("store %s not found", driver_name);
		return SD_RES_NO_STORE;
	}

	/* Gateways never touch the backend, so skip its init. */
	return is_gateway ? SD_RES_SUCCESS : sd_store->init();
}
/*
 * Format (wipe) the plain store: purge all object directories and,
 * when enabled, the object cache.
 *
 * Fix: declared ret as int instead of unsigned — for_each_obj_path()
 * results are compared against the int SD_RES_* codes, matching the
 * sibling default_init().
 */
int default_format(void)
{
	int ret;

	sd_dprintf("try get a clean store");
	ret = for_each_obj_path(purge_dir);
	if (ret != SD_RES_SUCCESS)
		return ret;

	if (sys->enable_object_cache)
		object_cache_format();

	return SD_RES_SUCCESS;
}
/*
 * Per-object scan callback: register @oid in the object list cache
 * and, for VDI objects, mark the vdi in use and record its copy
 * number.
 */
static int init_objlist_and_vdi_bitmap(uint64_t oid, char *wd, void *arg)
{
	objlist_cache_insert(oid);

	if (!is_vdi_obj(oid))
		return SD_RES_SUCCESS;

	sd_dprintf("found the VDI object %" PRIx64, oid);
	set_bit(oid_to_vid(oid), sys->vdi_inuse);
	return init_vdi_copy_number(oid, wd);
}
/*
 * Map an errno value @err from an object I/O on @path to an SD_RES_*
 * code, escalating to md_handle_eio() when the containing directory
 * looks broken.
 *
 * NOTE: dirname() may modify @path in place.
 */
static int err_to_sderr(char *path, uint64_t oid, int err)
{
	struct stat s;
	char *dir = dirname(path);

	sd_dprintf("%s", dir);
	switch (err) {
	case ENOENT:
		if (stat(dir, &s) < 0) {
			/* The directory itself is gone: treat as disk error */
			sd_eprintf("%s corrupted", dir);
			return md_handle_eio(dir);
		}
		sd_dprintf("object %016" PRIx64 " not found locally", oid);
		return SD_RES_NO_OBJ;
	case ENOSPC:
		/* TODO: stop automatic recovery */
		sd_eprintf("diskfull, oid=%"PRIx64, oid);
		return SD_RES_NO_SPACE;
	default:
		sd_eprintf("oid=%"PRIx64", %m", oid);
		return md_handle_eio(dir);
	}
}
/*
 * Distribute vdisks across disks in proportion to each disk's free
 * space relative to the average.
 *
 * NOTE(review): sizes are read from @disks but the results are
 * written to the global md_disks array — presumably the caller keeps
 * the two in the same order; confirm.
 */
static inline void calculate_vdisks(struct disk *disks, int nr_disks,
				    uint64_t total)
{
	uint64_t avg_size = total / nr_disks;
	float factor;
	int i;

	for (i = 0; i < nr_disks; i++) {
		factor = (float)disks[i].space / (float)avg_size;
		md_disks[i].nr_vdisks = rintf(MD_DEFAULT_VDISKS * factor);
		sd_dprintf("%s has %d vdisks, free space %" PRIu64,
			   md_disks[i].path, md_disks[i].nr_vdisks,
			   md_disks[i].space);
	}
}
/*
 * Initialize the plain store driver under working directory @p:
 * create the stale directory and scan existing objects.
 */
int default_init(const char *p)
{
	sd_dprintf("use plain store driver\n");

	/* create a stale directory */
	snprintf(stale_dir, sizeof(stale_dir), "%s/.stale", p);
	if (mkdir(stale_dir, 0755) < 0 && errno != EEXIST) {
		sd_eprintf("%m\n");
		return SD_RES_EIO;
	}

	return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL);
}
/* Fetch the object list from all the nodes in the cluster */
static uint64_t *fetch_object_list(struct sd_node *e, uint32_t epoch,
				   size_t *nr_oids)
{
	char name[128];
	struct sd_list_req hdr;
	struct sd_list_rsp *rsp = (struct sd_list_rsp *)&hdr;
	size_t buf_size = list_buffer_size;
	uint64_t *buf = xmalloc(buf_size);
	int ret;

	addr_to_str(name, sizeof(name), e->nid.addr, 0);
	sd_dprintf("%s %"PRIu32, name, e->nid.port);

retry:
	sd_init_req((struct sd_req *)&hdr, SD_OP_GET_OBJ_LIST);
	/* Ask for the list the node had in the previous epoch. */
	hdr.tgt_epoch = epoch - 1;
	hdr.data_length = buf_size;
	hdr.epoch = sys_epoch();

	ret = sheep_exec_req(&e->nid, (struct sd_req *)&hdr, buf);
	switch (ret) {
	case SD_RES_SUCCESS:
		break;
	case SD_RES_BUFFER_SMALL:
		/* Double the buffer and retry until the list fits. */
		buf_size *= 2;
		buf = xrealloc(buf, buf_size);
		goto retry;
	default:
		free(buf);
		return NULL;
	}

	*nr_oids = rsp->data_length / sizeof(uint64_t);
	sd_dprintf("%zu", *nr_oids);
	return buf;
}
/*
 * Return the base directory that holds @oid. With multi-disk enabled
 * the oid is mapped to one of the md disks under the read lock;
 * otherwise the single obj_path is used.
 */
char *get_object_path(uint64_t oid)
{
	struct vdisk *vd;
	char *path;

	if (!sys->enable_md)
		return obj_path;

	pthread_rwlock_rdlock(&md_lock);
	vd = oid_to_vdisk(oid);
	path = md_disks[vd->idx].path;
	pthread_rwlock_unlock(&md_lock);

	sd_dprintf("%d, %s", vd->idx, path);
	return path;
}
/*
 * All objects recovered: clear the global recovery state, let the
 * store driver finish up, and tell the other nodes.
 */
static inline void finish_recovery(struct recovery_work *rw)
{
	recovering_work = NULL;
	sys->recovered_epoch = rw->epoch;

	if (sd_store->end_recover)
		sd_store->end_recover(sys->epoch - 1, rw->old_vinfo);

	/* notify recovery completion to other nodes */
	rw->work.fn = notify_recovery_completion_work;
	rw->work.done = notify_recovery_completion_main;
	queue_work(sys->recovery_wqueue, &rw->work);

	sd_dprintf("recovery complete: new epoch %"PRIu32,
		   sys->recovered_epoch);
}
/*
 * Fetch one replica of @oid from @vnode at @tgt_epoch and store it
 * locally under @epoch. A replica held by the local node is simply
 * linked into place via the store driver.
 */
static int recover_object_from_replica(uint64_t oid,
				       const struct sd_vnode *vnode,
				       uint32_t epoch, uint32_t tgt_epoch)
{
	struct sd_req hdr;
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	unsigned rlen;
	int ret = SD_RES_NO_MEM;
	void *buf = NULL;
	struct siocb iocb = { 0 };

	if (vnode_is_local(vnode)) {
		/* The replica already lives here: link it from tgt_epoch. */
		ret = sd_store->link(oid, tgt_epoch);
		goto out;
	}

	rlen = get_objsize(oid);
	/* valloc: page-aligned buffer, presumably for direct I/O — confirm */
	buf = valloc(rlen);
	if (!buf) {
		sd_eprintf("%m");
		goto out;
	}

	sd_init_req(&hdr, SD_OP_READ_PEER);
	hdr.epoch = epoch;
	hdr.flags = SD_FLAG_CMD_RECOVERY;
	hdr.data_length = rlen;
	hdr.obj.oid = oid;
	hdr.obj.tgt_epoch = tgt_epoch;

	ret = sheep_exec_req(&vnode->nid, &hdr, buf);
	if (ret != SD_RES_SUCCESS)
		goto out;

	iocb.epoch = epoch;
	iocb.length = rsp->data_length;
	iocb.offset = rsp->obj.offset;
	iocb.buf = buf;
	ret = sd_store->create_and_write(oid, &iocb);
out:
	if (ret == SD_RES_SUCCESS) {
		sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d",
			   oid, tgt_epoch, epoch);
		objlist_cache_insert(oid);
	}
	free(buf);
	return ret;
}