/* Screen out objects that don't belong to this node */ static void screen_object_list(struct recovery_work *rw, uint64_t *oids, int nr_oids) { struct sd_vnode *vnodes[SD_MAX_COPIES]; int old_count = rw->count; int nr_objs; int i, j; nr_objs = get_nr_copies(rw->cur_vnodes); for (i = 0; i < nr_oids; i++) { oid_to_vnodes(rw->cur_vnodes, oids[i], nr_objs, vnodes); for (j = 0; j < nr_objs; j++) { if (!vnode_is_local(vnodes[j])) continue; if (bsearch(&oids[i], rw->oids, old_count, sizeof(uint64_t), obj_cmp)) continue; rw->oids[rw->count++] = oids[i]; break; } } qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp); }
/* * Recover the object from its track in epoch history. That is, * the routine will try to recovery it from the nodes it has stayed, * at least, *theoretically* on consistent hash ring. */ static int do_recover_object(struct recovery_work *rw) { struct vnode_info *old; uint64_t oid = rw->oids[rw->done]; uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1; int nr_copies, ret, i; old = grab_vnode_info(rw->old_vnodes); again: dprintf("try recover object %"PRIx64" from epoch %"PRIu32"\n", oid, tgt_epoch); /* Let's do a breadth-first search */ nr_copies = get_nr_copies(old); for (i = 0; i < nr_copies; i++) { struct sd_vnode *tgt_vnode = oid_to_vnode(old, oid, i); if (is_invalid_vnode(tgt_vnode, rw->cur_vnodes->nodes, rw->cur_vnodes->nr_nodes)) continue; ret = recover_object_from_replica(oid, tgt_vnode, epoch, tgt_epoch); if (ret == 0) { /* Succeed */ break; } else if (SD_RES_OLD_NODE_VER == ret) { rw->stop = 1; goto err; } else ret = -1; } /* No luck, roll back to an older configuration and try again */ if (ret < 0) { struct vnode_info *new_old; tgt_epoch--; if (tgt_epoch < 1) { eprintf("can not recover oid %"PRIx64"\n", oid); ret = -1; goto err; } new_old = get_vnode_info_epoch(tgt_epoch); if (!new_old) { ret = -1; goto err; } put_vnode_info(old); old = new_old; goto again; } err: put_vnode_info(old); return ret; }
static int forward_read_obj_req(struct request *req) { int i, fd, ret = SD_RES_SUCCESS; unsigned wlen, rlen; struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq; struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr; struct sd_vnode *v; uint64_t oid = hdr.oid; int nr_copies; hdr.flags |= SD_FLAG_CMD_IO_LOCAL; if (hdr.copies) nr_copies = hdr.copies; else nr_copies = get_nr_copies(req->vnodes); /* TODO: we can do better; we need to check this first */ for (i = 0; i < nr_copies; i++) { v = oid_to_vnode(req->vnodes, oid, i); if (vnode_is_local(v)) { ret = do_local_io(req, hdr.epoch); if (ret != SD_RES_SUCCESS) goto read_remote; return ret; } } read_remote: for (i = 0; i < nr_copies; i++) { v = oid_to_vnode(req->vnodes, oid, i); if (vnode_is_local(v)) continue; fd = get_sheep_fd(v->addr, v->port, v->node_idx, hdr.epoch); if (fd < 0) { ret = SD_RES_NETWORK_ERROR; continue; } wlen = 0; rlen = hdr.data_length; ret = exec_req(fd, (struct sd_req *)&hdr, req->data, &wlen, &rlen); if (ret) { /* network errors */ del_sheep_fd(fd); ret = SD_RES_NETWORK_ERROR; continue; } else { memcpy(&req->rp, rsp, sizeof(*rsp)); ret = rsp->result; break; } } return ret; }
int forward_write_obj_req(struct request *req) { int i, fd, ret, pollret; unsigned wlen; char name[128]; struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq; struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp; struct sd_vnode *v; uint64_t oid = hdr.oid; int nr_copies; struct pollfd pfds[SD_MAX_REDUNDANCY]; int nr_fds, local = 0; dprintf("%"PRIx64"\n", oid); nr_fds = 0; memset(pfds, 0, sizeof(pfds)); for (i = 0; i < ARRAY_SIZE(pfds); i++) pfds[i].fd = -1; hdr.flags |= SD_FLAG_CMD_IO_LOCAL; wlen = hdr.data_length; nr_copies = get_nr_copies(req->vnodes); for (i = 0; i < nr_copies; i++) { v = oid_to_vnode(req->vnodes, oid, i); addr_to_str(name, sizeof(name), v->addr, 0); if (vnode_is_local(v)) { local = 1; continue; } fd = get_sheep_fd(v->addr, v->port, v->node_idx, hdr.epoch); if (fd < 0) { eprintf("failed to connect to %s:%"PRIu32"\n", name, v->port); ret = SD_RES_NETWORK_ERROR; goto out; } ret = send_req(fd, (struct sd_req *)&hdr, req->data, &wlen); if (ret) { /* network errors */ del_sheep_fd(fd); ret = SD_RES_NETWORK_ERROR; dprintf("fail %"PRIu32"\n", ret); goto out; } pfds[nr_fds].fd = fd; pfds[nr_fds].events = POLLIN; nr_fds++; } if (local) { ret = do_local_io(req, hdr.epoch); rsp->result = ret; if (nr_fds == 0) { eprintf("exit %"PRIu32"\n", ret); goto out; } if (rsp->result != SD_RES_SUCCESS) { eprintf("fail %"PRIu32"\n", ret); goto out; } } ret = SD_RES_SUCCESS; again: pollret = poll(pfds, nr_fds, DEFAULT_SOCKET_TIMEOUT * 1000); if (pollret < 0) { if (errno == EINTR) goto again; ret = SD_RES_EIO; } else if (pollret == 0) { /* poll time out */ eprintf("timeout\n"); for (i = 0; i < nr_fds; i++) del_sheep_fd(pfds[i].fd); ret = SD_RES_NETWORK_ERROR; goto out; } for (i = 0; i < nr_fds; i++) { if (pfds[i].fd < 0) break; if (pfds[i].revents & POLLERR || pfds[i].revents & POLLHUP || pfds[i].revents & POLLNVAL) { del_sheep_fd(pfds[i].fd); ret = SD_RES_NETWORK_ERROR; break; } if (!(pfds[i].revents & POLLIN)) continue; if (do_read(pfds[i].fd, rsp, sizeof(*rsp))) { eprintf("failed to read a response: %m\n"); del_sheep_fd(pfds[i].fd); ret = SD_RES_NETWORK_ERROR; break; } if (rsp->result != SD_RES_SUCCESS) { eprintf("fail %"PRIu32"\n", rsp->result); ret = rsp->result; } break; } if (i < nr_fds) { nr_fds--; memmove(pfds + i, pfds + i + 1, sizeof(*pfds) * (nr_fds - i)); } dprintf("%"PRIx64" %"PRIu32"\n", oid, nr_fds); if (nr_fds > 0) { goto again; } out: return ret; }