/*
 * Recover the object by tracing it back through epoch history.  That
 * is, the routine tries to recover it from the nodes it has, at least
 * *theoretically*, resided on according to the consistent hash ring.
 */
static int do_recover_object(struct recovery_work *rw)
{
        struct vnode_info *old;
        uint64_t oid = rw->oids[rw->done];
        uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1;
        int nr_copies, ret = -1, i;

        old = grab_vnode_info(rw->old_vinfo);
again:
        sd_dprintf("try to recover object %"PRIx64" from epoch %"PRIu32,
                   oid, tgt_epoch);

        /* Let's do a breadth-first search */
        nr_copies = get_obj_copy_number(oid, old->nr_zones);
        for (i = 0; i < nr_copies; i++) {
                const struct sd_vnode *tgt_vnode;

                tgt_vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i);

                if (is_invalid_vnode(tgt_vnode, rw->cur_vinfo->nodes,
                                     rw->cur_vinfo->nr_nodes))
                        continue;
                ret = recover_object_from_replica(oid, tgt_vnode,
                                                  epoch, tgt_epoch);
                if (ret == SD_RES_SUCCESS) {
                        /* Succeeded */
                        break;
                } else if (ret == SD_RES_OLD_NODE_VER) {
                        rw->stop = true;
                        goto err;
                } else
                        ret = -1;
        }

        /* No luck, roll back to an older configuration and try again */
        if (ret < 0) {
                struct vnode_info *new_old;
rollback:
                tgt_epoch--;
                if (tgt_epoch < 1) {
                        sd_eprintf("cannot recover oid %"PRIx64, oid);
                        ret = -1;
                        goto err;
                }

                new_old = get_vnode_info_epoch(tgt_epoch);
                if (!new_old)
                        /* Keep rolling back until we get a valid epoch */
                        goto rollback;

                put_vnode_info(old);
                old = new_old;
                goto again;
        }
err:
        put_vnode_info(old);
        return ret;
}
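/*
 * Below is a minimal, self-contained sketch (not Sheepdog code) of the
 * search pattern do_recover_object() implements: try every replica
 * location the hash ring prescribed at the target epoch, and if none
 * of them yields the object, roll back one more epoch and repeat until
 * epoch 1 is exhausted.  All names here (ring_at, try_replica,
 * NR_EPOCHS, NR_COPIES) are hypothetical stand-ins.
 */
#include <stdio.h>
#include <stdbool.h>

#define NR_EPOCHS 4
#define NR_COPIES 3

/* Hypothetical stand-in: which node held copy i of the object at a
 * given epoch.  A real ring would hash the object id. */
static int ring_at(int epoch, int copy)
{
        return (epoch * 7 + copy) % 5;
}

/* Hypothetical stand-in for recover_object_from_replica(): pretend
 * only node 3 still has the object. */
static bool try_replica(int node)
{
        return node == 3;
}

static int recover(int cur_epoch)
{
        int tgt_epoch;

        /* Walk backwards through epoch history */
        for (tgt_epoch = cur_epoch - 1; tgt_epoch >= 1; tgt_epoch--) {
                int i;

                /* Breadth-first over the replicas of this epoch */
                for (i = 0; i < NR_COPIES; i++) {
                        int node = ring_at(tgt_epoch, i);

                        if (try_replica(node)) {
                                printf("recovered from node %d at epoch %d\n",
                                       node, tgt_epoch);
                                return 0;
                        }
                }
        }
        return -1; /* ran out of history */
}

int main(void)
{
        return recover(NR_EPOCHS) == 0 ? 0 : 1;
}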
int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo)
{
        struct recovery_work *rw;

        if (node_is_gateway_only())
                return 0;

        rw = zalloc(sizeof(struct recovery_work));
        if (!rw) {
                eprintf("%m\n");
                return -1;
        }

        rw->state = RW_INIT;
        rw->oids = xmalloc(1 << 20); /* FIXME */
        rw->epoch = sys->epoch;
        rw->count = 0;

        rw->cur_vinfo = grab_vnode_info(cur_vinfo);
        rw->old_vinfo = grab_vnode_info(old_vinfo);

        rw->work.fn = prepare_object_list;
        rw->work.done = finish_object_list;

        if (sd_store->begin_recover) {
                struct siocb iocb = { 0 };
                iocb.epoch = rw->epoch;
                sd_store->begin_recover(&iocb);
        }

        if (recovering_work != NULL) {
                /* skip the previous epoch recovery */
                if (next_rw)
                        free_recovery_work(next_rw);
                dprintf("recovery skipped\n");
                next_rw = rw;
        } else {
                recovering_work = rw;
                queue_work(sys->recovery_wqueue, &rw->work);
        }

        resume_wait_epoch_requests();

        return 0;
}
int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo)
{
        struct recovery_work *rw;

        if (node_is_gateway_only())
                goto out;

        rw = xzalloc(sizeof(struct recovery_work));
        rw->state = RW_INIT;
        rw->oids = xmalloc(list_buffer_size);
        rw->epoch = sys->epoch;
        rw->count = 0;

        rw->cur_vinfo = grab_vnode_info(cur_vinfo);
        rw->old_vinfo = grab_vnode_info(old_vinfo);

        rw->work.fn = prepare_object_list;
        rw->work.done = finish_object_list;

        if (recovering_work != NULL) {
                /* skip the previous epoch recovery */
                if (next_rw)
                        free_recovery_work(next_rw);
                sd_dprintf("recovery skipped");
                next_rw = rw;

                /*
                 * This is necessary to invoke run_next_rw when
                 * recovery work is suspended.
                 */
                resume_suspended_recovery();
        } else {
                recovering_work = rw;
                queue_work(sys->recovery_wqueue, &rw->work);
        }
out:
        wakeup_requests_on_epoch();

        return 0;
}
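/*
 * A self-contained sketch (hypothetical names throughout, not the
 * Sheepdog API) of the coalescing pattern both start_recovery()
 * variants use: while one recovery job is running, a newly submitted
 * job does not queue behind it; it replaces whatever was waiting,
 * since only the most recent epoch is worth recovering.
 */
#include <stdio.h>
#include <stdlib.h>

struct job {
        int epoch;
};

static struct job *running;     /* job currently being processed */
static struct job *next;        /* at most one pending job */

static void submit(struct job *j)
{
        if (running) {
                /* Drop the stale pending job; keep only the newest */
                free(next);
                next = j;
                printf("epoch %d parked behind running job\n", j->epoch);
        } else {
                running = j;
                printf("epoch %d started\n", j->epoch);
        }
}

static void job_done(void)
{
        free(running);
        running = next;
        next = NULL;
        if (running)
                printf("epoch %d started\n", running->epoch);
}

int main(void)
{
        int e;

        for (e = 1; e <= 3; e++) {
                struct job *j = malloc(sizeof(*j));
                j->epoch = e;
                submit(j);
        }
        /* The running job finishes; only epoch 3 remains to run */
        job_done();
        job_done();
        return 0;
}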
/*
 * Get a reference to the currently active vnode information structure.
 * This must only be called from the main thread.
 */
struct vnode_info *get_vnode_info(void)
{
        assert(current_vnode_info);

        return grab_vnode_info(current_vnode_info);
}
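/*
 * A minimal sketch (hypothetical names, not the Sheepdog API) of the
 * grab/put reference-counting discipline that get_vnode_info() and the
 * recovery code rely on: every grab is paired with a put, and the
 * structure is freed only when the last reference is dropped, so a
 * worker can keep using an old epoch's vnode table after the "current"
 * pointer has moved on.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct vinfo {
        int refcnt;
        /* ... vnode tables would live here ... */
};

static struct vinfo *grab(struct vinfo *v)
{
        assert(v->refcnt > 0);
        v->refcnt++;
        return v;
}

static void put(struct vinfo *v)
{
        if (--v->refcnt == 0) {
                printf("last reference dropped, freeing\n");
                free(v);
        }
}

int main(void)
{
        struct vinfo *cur = calloc(1, sizeof(*cur));
        struct vinfo *ref;

        cur->refcnt = 1;        /* reference held by the "current" pointer */

        ref = grab(cur);        /* a worker takes its own reference */
        put(cur);               /* "current" moves on to a newer epoch */
        put(ref);               /* worker finishes; structure is freed */
        return 0;
}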