/* * Recover the object from its track in epoch history. That is, * the routine will try to recovery it from the nodes it has stayed, * at least, *theoretically* on consistent hash ring. */ static int do_recover_object(struct recovery_work *rw) { struct vnode_info *old; uint64_t oid = rw->oids[rw->done]; uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1; int nr_copies, ret, i; old = grab_vnode_info(rw->old_vinfo); again: sd_dprintf("try recover object %"PRIx64" from epoch %"PRIu32, oid, tgt_epoch); /* Let's do a breadth-first search */ nr_copies = get_obj_copy_number(oid, old->nr_zones); for (i = 0; i < nr_copies; i++) { const struct sd_vnode *tgt_vnode; tgt_vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i); if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes, rw->cur_vinfo->nr_nodes)) continue; ret = recover_object_from_replica(oid, tgt_vnode, epoch, tgt_epoch); if (ret == SD_RES_SUCCESS) { /* Succeed */ break; } else if (SD_RES_OLD_NODE_VER == ret) { rw->stop = true; goto err; } else ret = -1; } /* No luck, roll back to an older configuration and try again */ if (ret < 0) { struct vnode_info *new_old; rollback: tgt_epoch--; if (tgt_epoch < 1) { sd_eprintf("can not recover oid %"PRIx64, oid); ret = -1; goto err; } new_old = get_vnode_info_epoch(tgt_epoch); if (!new_old) /* We rollback in case we don't get a valid epoch */ goto rollback; put_vnode_info(old); old = new_old; goto again; } err: put_vnode_info(old); return ret; }
/*
 * Tear down a recovery work item: hand both vnode info pointers it carries
 * to put_vnode_info(), then free the object id array and the item itself.
 */
static void free_recovery_work(struct recovery_work *rw)
{
	struct vnode_info *cur = rw->cur_vinfo;
	struct vnode_info *prev = rw->old_vinfo;

	put_vnode_info(cur);
	put_vnode_info(prev);

	free(rw->oids);
	free(rw);
}
/*
 * Start a recovery in which the vnode info returned by get_vnode_info()
 * serves as both arguments of start_recovery(); the pointer obtained here
 * is passed to put_vnode_info() afterwards.
 */
static inline void kick_recover(void)
{
	struct vnode_info *cur = get_vnode_info();

	start_recovery(cur, cur);
	put_vnode_info(cur);
}
/*
 * Tell whether an object is stale with respect to the vnode info returned
 * by get_vnode_info().
 *
 * Returns false when the object's copy number is zero or when any of the
 * vnodes the object maps to is local (per vnode_is_local()); returns true
 * otherwise.
 */
static bool oid_stale(uint64_t oid)
{
	struct sd_vnode *replicas[SD_MAX_COPIES];
	struct vnode_info *cur = get_vnode_info();
	int copies = get_obj_copy_number(oid, cur->nr_zones);
	bool stale;
	int idx;

	if (copies == 0) {
		stale = false;
	} else {
		oid_to_vnodes(cur->vnodes, cur->nr_vnodes, oid, copies,
			      replicas);

		/* Stop at the first local vnode: the object is not stale */
		stale = true;
		for (idx = 0; idx < copies && stale; idx++)
			stale = !vnode_is_local(replicas[idx]);
	}

	put_vnode_info(cur);
	return stale;
}
/*
 * Return nr_nodes of the vnode info returned by get_vnode_info(), or 1
 * when that call yields NULL.
 */
static size_t get_nr_nodes(void)
{
	struct vnode_info *cur = get_vnode_info();
	size_t count = cur != NULL ? cur->nr_nodes : 1;

	/* Called unconditionally, i.e. also with a NULL pointer */
	put_vnode_info(cur);
	return count;
}
static int update_vnode_info(void) { struct vnode_info *vnode_info; vnode_info = zalloc(sizeof(*vnode_info)); if (!vnode_info) { eprintf("failed to allocate memory\n"); return 1; } vnode_info->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes, vnode_info->entries); vnode_info->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes); vnode_info->refcnt = 1; put_vnode_info(current_vnode_info); current_vnode_info = vnode_info; return 0; }
/*
 * Apply the cluster state carried by a join message to the local node.
 *
 * msg:      join message providing cluster_status, epoch, disable_recovery,
 *           inc_epoch and, for a node waiting for format, nr_copies,
 *           cluster_flags and ctime.
 * joined:   the node the message is about; its nid is added to the sockfd
 *           cache at the end.
 * nodes:    node array used to build the new vnode info.
 * nr_nodes: number of entries in nodes.
 */
static void update_cluster_info(struct join_message *msg,
				struct sd_node *joined, struct sd_node *nodes,
				size_t nr_nodes)
{
	struct vnode_info *old_vnode_info = NULL;

	eprintf("status = %d, epoch = %d, finished: %d\n", msg->cluster_status, msg->epoch, sys->join_finished);

	sys->disable_recovery = msg->disable_recovery;

	if (!sys->join_finished)
		finish_join(msg, joined, nodes, nr_nodes);

	/*
	 * Unless recovery is disabled, replace the current vnode info right
	 * away and keep the previous pointer in old_vnode_info; it is handed
	 * to put_vnode_info() at the end of this function.
	 */
	if (!sys->disable_recovery) {
		old_vnode_info = current_vnode_info;
		current_vnode_info = alloc_vnode_info(nodes, nr_nodes);
	}

	switch (msg->cluster_status) {
	case SD_STATUS_OK:
	case SD_STATUS_HALT:
		switch (sys->status) {
		case SD_STATUS_WAIT_FOR_FORMAT:
			/* Adopt the cluster-wide settings from the message */
			sys->nr_copies = msg->nr_copies;
			sys->flags = msg->cluster_flags;

			set_cluster_copies(sys->nr_copies);
			set_cluster_flags(sys->flags);
			set_cluster_ctime(msg->ctime);
			/*FALLTHROUGH*/
		case SD_STATUS_WAIT_FOR_JOIN:
			get_vdi_bitmap(nodes, nr_nodes);
			break;
		default:
			break;
		}

		sys->status = msg->cluster_status;

		if (msg->inc_epoch) {
			if (!sys->disable_recovery) {
				uatomic_inc(&sys->epoch);
				log_current_epoch();
				clear_exceptional_node_lists();

				/*
				 * current_vnode_info was NULL before the
				 * replacement above, so build an old vnode
				 * info for start_recovery() here.
				 */
				if (!old_vnode_info) {
					old_vnode_info = alloc_old_vnode_info(joined, nodes, nr_nodes);
				}

				start_recovery(current_vnode_info, old_vnode_info);
			} else
				prepare_recovery(joined, nodes, nr_nodes);
		}

		/* Overrides the status copied from the message just above */
		if (have_enough_zones())
			sys->status = SD_STATUS_OK;
		break;
	default:
		sys->status = msg->cluster_status;
		break;
	}

	/*
	 * old_vnode_info is still NULL here when recovery is disabled.
	 * NOTE(review): this relies on put_vnode_info() accepting NULL -
	 * confirm against its definition.
	 */
	put_vnode_info(old_vnode_info);

	sockfd_cache_add(&joined->nid);
}
/*
 * Update the currently active vnode information structure.
 * This must only be called from the main thread.
 *
 * The pointer previously stored in current_vnode_info is passed to
 * put_vnode_info() before vnode_info is installed in its place.
 */
void update_vnode_info(struct vnode_info *vnode_info)
{
	put_vnode_info(current_vnode_info);
	current_vnode_info = vnode_info;
}