/* * Try our best to read one copy and read local first. * * Return success if any read succeed. We don't call gateway_forward_request() * because we only read once. */ int gateway_read_obj(struct request *req) { int i, ret = SD_RES_SUCCESS; unsigned wlen, rlen; struct sd_req fwd_hdr; struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr; struct sd_vnode *v; struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; uint64_t oid = req->rq.obj.oid; int nr_copies, j; if (is_object_cache_enabled() && !req->local && !bypass_object_cache(req)) return object_cache_handle_request(req); nr_copies = get_req_copy_number(req); oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[i]; if (!vnode_is_local(v)) continue; ret = peer_read_obj(req); if (ret == SD_RES_SUCCESS) return ret; eprintf("local read fail %x\n", ret); break; } /* * Read random copy from cluster for better load balance, useful for * reading base VM's COW objects */ j = random(); for (i = 0; i < nr_copies; i++) { int idx = (i + j) % nr_copies; v = obj_vnodes[idx]; if (vnode_is_local(v)) continue; /* * We need to re-init it because rsp and req share the same * structure. */ gateway_init_fwd_hdr(&fwd_hdr, &req->rq); wlen = 0; rlen = fwd_hdr.data_length; ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data, &wlen, &rlen); if (ret != SD_RES_SUCCESS) continue; /* Read success */ memcpy(&req->rp, rsp, sizeof(*rsp)); break; } return ret; }
static int forward_read_obj_req(struct request *req) { int i, fd, ret = SD_RES_SUCCESS; unsigned wlen, rlen; struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq; struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr; struct sd_vnode *v; uint64_t oid = hdr.oid; int nr_copies; hdr.flags |= SD_FLAG_CMD_IO_LOCAL; if (hdr.copies) nr_copies = hdr.copies; else nr_copies = get_nr_copies(req->vnodes); /* TODO: we can do better; we need to check this first */ for (i = 0; i < nr_copies; i++) { v = oid_to_vnode(req->vnodes, oid, i); if (vnode_is_local(v)) { ret = do_local_io(req, hdr.epoch); if (ret != SD_RES_SUCCESS) goto read_remote; return ret; } } read_remote: for (i = 0; i < nr_copies; i++) { v = oid_to_vnode(req->vnodes, oid, i); if (vnode_is_local(v)) continue; fd = get_sheep_fd(v->addr, v->port, v->node_idx, hdr.epoch); if (fd < 0) { ret = SD_RES_NETWORK_ERROR; continue; } wlen = 0; rlen = hdr.data_length; ret = exec_req(fd, (struct sd_req *)&hdr, req->data, &wlen, &rlen); if (ret) { /* network errors */ del_sheep_fd(fd); ret = SD_RES_NETWORK_ERROR; continue; } else { memcpy(&req->rp, rsp, sizeof(*rsp)); ret = rsp->result; break; } } return ret; }
/*
 * An object is considered stale when none of its replicas is mapped to
 * this node in the current vnode ring.
 */
static bool oid_stale(uint64_t oid)
{
	int idx, copies;
	struct vnode_info *cur_vinfo;
	struct sd_vnode *replicas[SD_MAX_COPIES];
	bool stale = true;

	cur_vinfo = get_vnode_info();
	copies = get_obj_copy_number(oid, cur_vinfo->nr_zones);
	if (!copies) {
		/* unknown copy count: don't treat the object as stale */
		stale = false;
		goto out;
	}

	oid_to_vnodes(cur_vinfo->vnodes, cur_vinfo->nr_vnodes, oid, copies,
		      replicas);
	for (idx = 0; idx < copies; idx++) {
		if (vnode_is_local(replicas[idx])) {
			stale = false;
			break;
		}
	}
out:
	put_vnode_info(cur_vinfo);
	return stale;
}
/* Screen out objects that don't belong to this node */ static void screen_object_list(struct recovery_work *rw, uint64_t *oids, int nr_oids) { struct sd_vnode *vnodes[SD_MAX_COPIES]; int old_count = rw->count; int nr_objs; int i, j; nr_objs = get_nr_copies(rw->cur_vnodes); for (i = 0; i < nr_oids; i++) { oid_to_vnodes(rw->cur_vnodes, oids[i], nr_objs, vnodes); for (j = 0; j < nr_objs; j++) { if (!vnode_is_local(vnodes[j])) continue; if (bsearch(&oids[i], rw->oids, old_count, sizeof(uint64_t), obj_cmp)) continue; rw->oids[rw->count++] = oids[i]; break; } } qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp); }
/*
 * Recover one object, either by hard-linking the local copy from the
 * target epoch or by fetching it from a remote replica.
 *
 * @oid:       object to recover
 * @vnode:     replica to recover from (may be this node itself)
 * @epoch:     current epoch the recovered object is written under
 * @tgt_epoch: epoch the object is read from
 *
 * Returns SD_RES_SUCCESS on success; on success the object is also
 * registered in the object list cache.
 */
static int recover_object_from_replica(uint64_t oid,
				       const struct sd_vnode *vnode,
				       uint32_t epoch, uint32_t tgt_epoch)
{
	struct sd_req hdr;
	/* rsp aliases hdr: sheep_exec_req writes the response in place */
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	unsigned rlen;
	int ret = SD_RES_NO_MEM;
	void *buf = NULL;
	struct siocb iocb = { 0 };

	/* Local replica: just link the object from the target epoch */
	if (vnode_is_local(vnode)) {
		ret = sd_store->link(oid, tgt_epoch);
		goto out;
	}

	rlen = get_objsize(oid);
	/* valloc: page-aligned buffer, presumably for direct I/O — TODO confirm */
	buf = valloc(rlen);
	if (!buf) {
		sd_eprintf("%m");
		goto out;
	}

	/* Ask the remote peer for the object as it was at tgt_epoch */
	sd_init_req(&hdr, SD_OP_READ_PEER);
	hdr.epoch = epoch;
	hdr.flags = SD_FLAG_CMD_RECOVERY;
	hdr.data_length = rlen;
	hdr.obj.oid = oid;
	hdr.obj.tgt_epoch = tgt_epoch;

	ret = sheep_exec_req(&vnode->nid, &hdr, buf);
	if (ret != SD_RES_SUCCESS)
		goto out;

	/* Write the fetched data locally under the current epoch */
	iocb.epoch = epoch;
	iocb.length = rsp->data_length;
	iocb.offset = rsp->obj.offset;
	iocb.buf = buf;
	ret = sd_store->create_and_write(oid, &iocb);
out:
	if (ret == SD_RES_SUCCESS) {
		/* covers both the local-link and the remote-fetch paths */
		sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d",
			   oid, tgt_epoch, epoch);
		objlist_cache_insert(oid);
	}
	/* free(NULL) is a no-op, so this is safe on the local-link path */
	free(buf);
	return ret;
}
/* Screen out objects that don't belong to this node */
static void screen_object_list(struct recovery_work *rw,
			       uint64_t *oids, int nr_oids)
{
	struct sd_vnode *vnodes[SD_MAX_COPIES];
	/* entries below old_count are already sorted; used by bsearch */
	int old_count = rw->count;
	int nr_objs;
	int i, j;

	for (i = 0; i < nr_oids; i++) {
again:
		nr_objs = get_obj_copy_number(oids[i],
					      rw->cur_vinfo->nr_zones);
		if (!nr_objs) {
			/*
			 * NOTE(review): this retries forever, once per
			 * second, until the copy number becomes known —
			 * the recovery worker blocks here meanwhile.
			 */
			dprintf("can not find copy number for object %"
				PRIx64 "\n", oids[i]);
			dprintf("probably, vdi was created but "
				"post_cluster_new_vdi() is not called yet\n");

			/* FIXME: can we wait for post_cluster_new_vdi
			 * with a better way? */
			sleep(1);
			goto again;
		}
		oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes,
			      oids[i], nr_objs, vnodes);
		for (j = 0; j < nr_objs; j++) {
			/* keep the oid only if some replica maps here */
			if (!vnode_is_local(vnodes[j]))
				continue;
			/* skip oids already present in the sorted prefix */
			if (bsearch(&oids[i], rw->oids, old_count,
				    sizeof(uint64_t), obj_cmp))
				continue;
			/*
			 * NOTE(review): no capacity check on rw->oids here —
			 * presumably the buffer is pre-sized by the caller;
			 * verify against where rw->oids is allocated.
			 */
			rw->oids[rw->count++] = oids[i];
			break;
		}
	}

	/* re-sort so future bsearch calls over the prefix stay valid */
	qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
}
/*
 * For replicated object, if any of the replica belongs to this node, we
 * consider it not stale.
 *
 * For erasured object, since every copy is unique and if it migrates to other
 * node(index gets changed even it has some other copy belongs to it) because
 * of hash ring changes, we consider it stale.
 */
static bool oid_stale(uint64_t oid, int ec_index, struct vnode_info *vinfo)
{
	uint32_t i, nr_copies;
	const struct sd_vnode *v;
	bool ret = true;
	const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];

	nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
	oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
	for (i = 0; i < nr_copies; i++) {
		v = obj_vnodes[i];
		if (vnode_is_local(v)) {
			/*
			 * ec_index < SD_MAX_COPIES appears to mark an
			 * erasure-coded object — TODO confirm against the
			 * callers that pass ec_index.
			 */
			if (ec_index < SD_MAX_COPIES) {
				/*
				 * EC copy: only fresh if this node holds
				 * exactly the strip index it owned.
				 * NOTE(review): i is uint32_t and ec_index
				 * is int, so a negative ec_index would pass
				 * the < SD_MAX_COPIES test yet never match
				 * here — verify callers never pass one.
				 */
				if (i == ec_index)
					ret = false;
			} else {
				/* replicated copy: any local replica is fresh */
				ret = false;
			}
			break;
		}
	}

	return ret;
}
/* Screen out objects that don't belong to this node */ static void screen_object_list(struct recovery_work *rw, uint64_t *oids, size_t nr_oids) { const struct sd_vnode *vnodes[SD_MAX_COPIES]; int old_count = rw->count; int nr_objs; int i, j; for (i = 0; i < nr_oids; i++) { nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones); if (!nr_objs) { sd_eprintf("ERROR: can not find copy number for object" " %" PRIx64, oids[i]); continue; } oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes, oids[i], nr_objs, vnodes); for (j = 0; j < nr_objs; j++) { if (!vnode_is_local(vnodes[j])) continue; if (bsearch(&oids[i], rw->oids, old_count, sizeof(uint64_t), obj_cmp)) continue; rw->oids[rw->count++] = oids[i]; /* enlarge the list buffer if full */ if (rw->count == list_buffer_size / sizeof(uint64_t)) { list_buffer_size *= 2; rw->oids = xrealloc(rw->oids, list_buffer_size); } break; } } qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp); }
/*
 * Fan a write out to every replica of the object: issue the request to
 * all remote sheep, perform the local write if this node holds a copy,
 * then poll until every remote response has been collected.
 *
 * Returns SD_RES_SUCCESS only if the local write (if any) and all remote
 * writes succeed; otherwise the first failure code encountered.
 */
int forward_write_obj_req(struct request *req)
{
	int i, fd, ret, pollret;
	unsigned wlen;
	char name[128];
	struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq;
	struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp;
	struct sd_vnode *v;
	uint64_t oid = hdr.oid;
	int nr_copies;
	struct pollfd pfds[SD_MAX_REDUNDANCY];
	int nr_fds, local = 0;

	dprintf("%"PRIx64"\n", oid);

	nr_fds = 0;
	memset(pfds, 0, sizeof(pfds));
	/* -1 entries are ignored by poll(2) */
	for (i = 0; i < ARRAY_SIZE(pfds); i++)
		pfds[i].fd = -1;

	/* peers must execute the I/O locally instead of re-forwarding */
	hdr.flags |= SD_FLAG_CMD_IO_LOCAL;
	wlen = hdr.data_length;

	nr_copies = get_nr_copies(req->vnodes);

	/* Phase 1: send the request to every remote replica */
	for (i = 0; i < nr_copies; i++) {
		v = oid_to_vnode(req->vnodes, oid, i);

		addr_to_str(name, sizeof(name), v->addr, 0);

		if (vnode_is_local(v)) {
			/* remember to do the local write in phase 2 */
			local = 1;
			continue;
		}

		fd = get_sheep_fd(v->addr, v->port, v->node_idx, hdr.epoch);
		if (fd < 0) {
			eprintf("failed to connect to %s:%"PRIu32"\n",
				name, v->port);
			ret = SD_RES_NETWORK_ERROR;
			goto out;
		}

		ret = send_req(fd, (struct sd_req *)&hdr, req->data, &wlen);
		if (ret) { /* network errors */
			del_sheep_fd(fd);
			ret = SD_RES_NETWORK_ERROR;
			dprintf("fail %"PRIu32"\n", ret);
			goto out;
		}

		pfds[nr_fds].fd = fd;
		pfds[nr_fds].events = POLLIN;
		nr_fds++;
	}

	/* Phase 2: local write, overlapped with the in-flight remote I/O */
	if (local) {
		ret = do_local_io(req, hdr.epoch);
		rsp->result = ret;

		if (nr_fds == 0) {
			/* purely local write: nothing left to wait for */
			eprintf("exit %"PRIu32"\n", ret);
			goto out;
		}

		if (rsp->result != SD_RES_SUCCESS) {
			eprintf("fail %"PRIu32"\n", ret);
			goto out;
		}
	}

	ret = SD_RES_SUCCESS;
again:
	/* Phase 3: collect remote responses, one fd per pass */
	pollret = poll(pfds, nr_fds, DEFAULT_SOCKET_TIMEOUT * 1000);
	if (pollret < 0) {
		if (errno == EINTR)
			goto again;

		/*
		 * NOTE(review): on a non-EINTR poll failure we fall through
		 * into the loop below with revents untouched by this call —
		 * verify this is intentional.
		 */
		ret = SD_RES_EIO;
	} else if (pollret == 0) { /* poll time out */
		eprintf("timeout\n");

		for (i = 0; i < nr_fds; i++)
			del_sheep_fd(pfds[i].fd);

		ret = SD_RES_NETWORK_ERROR;
		goto out;
	}

	/* handle at most one ready fd, then compact the array and re-poll */
	for (i = 0; i < nr_fds; i++) {
		if (pfds[i].fd < 0)
			break;

		if (pfds[i].revents & POLLERR || pfds[i].revents & POLLHUP ||
		    pfds[i].revents & POLLNVAL) {
			del_sheep_fd(pfds[i].fd);
			ret = SD_RES_NETWORK_ERROR;
			break;
		}

		if (!(pfds[i].revents & POLLIN))
			continue;

		if (do_read(pfds[i].fd, rsp, sizeof(*rsp))) {
			eprintf("failed to read a response: %m\n");
			del_sheep_fd(pfds[i].fd);
			ret = SD_RES_NETWORK_ERROR;
			break;
		}

		if (rsp->result != SD_RES_SUCCESS) {
			eprintf("fail %"PRIu32"\n", rsp->result);
			ret = rsp->result;
		}

		break;
	}
	/* i < nr_fds means one fd completed (or errored): drop it */
	if (i < nr_fds) {
		nr_fds--;
		memmove(pfds + i, pfds + i + 1,
			sizeof(*pfds) * (nr_fds - i));
	}

	dprintf("%"PRIx64" %"PRIu32"\n", oid, nr_fds);
	if (nr_fds > 0) {
		goto again;
	}
out:
	return ret;
}
/*
 * Recover one object from a replica: link the local copy from tgt_epoch
 * if @entry is this node, otherwise fetch the object from the remote peer
 * and store it locally with atomic_put.
 *
 * NOTE(review): the return convention is mixed — 0/-1 on most paths but a
 * raw SD_RES_* code when the remote peer reports failure; the success
 * check below relies on SD_RES_SUCCESS comparing equal to 0 — verify.
 */
static int recover_object_from_replica(uint64_t oid,
				       struct sd_vnode *entry,
				       uint32_t epoch, uint32_t tgt_epoch)
{
	struct sd_req hdr;
	/* rsp aliases hdr: exec_req writes the response in place */
	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
	char name[128];
	unsigned wlen = 0, rlen;
	int fd, ret = -1;
	void *buf;
	struct siocb iocb = { 0 };

	rlen = get_objsize(oid);
	/* valloc: page-aligned buffer, presumably for direct I/O — TODO confirm */
	buf = valloc(rlen);
	if (!buf) {
		eprintf("%m\n");
		goto out;
	}

	/* Local replica: hard-link the object from the target epoch */
	if (vnode_is_local(entry)) {
		iocb.epoch = epoch;
		iocb.length = rlen;
		ret = sd_store->link(oid, &iocb, tgt_epoch);
		if (ret == SD_RES_SUCCESS) {
			ret = 0;
			goto done;
		} else {
			ret = -1;
			goto out;
		}
	}

	/* Remote replica: connect and read the object at tgt_epoch */
	addr_to_str(name, sizeof(name), entry->nid.addr, 0);
	fd = connect_to(name, entry->nid.port);
	dprintf("%s, %d\n", name, entry->nid.port);
	if (fd < 0) {
		eprintf("failed to connect to %s:%"PRIu32"\n",
			name, entry->nid.port);
		ret = -1;
		goto out;
	}

	sd_init_req(&hdr, SD_OP_READ_PEER);
	hdr.epoch = epoch;
	hdr.flags = SD_FLAG_CMD_RECOVERY;
	hdr.data_length = rlen;
	hdr.obj.oid = oid;
	hdr.obj.tgt_epoch = tgt_epoch;

	ret = exec_req(fd, &hdr, buf, &wlen, &rlen);

	close(fd);

	if (ret != 0) {
		/*
		 * NOTE(review): exec_req failed at the transport level, so
		 * rsp->result was likely never filled in here — this log
		 * may print garbage; verify exec_req's contract.
		 */
		eprintf("res: %"PRIx32"\n", rsp->result);
		ret = -1;
		goto out;
	}
	/* redundant: rsp already points at &hdr from its initializer */
	rsp = (struct sd_rsp *)&hdr;

	if (rsp->result == SD_RES_SUCCESS) {
		/* store the fetched data locally under the current epoch */
		iocb.epoch = epoch;
		iocb.length = rlen;
		iocb.buf = buf;
		ret = sd_store->atomic_put(oid, &iocb);
		if (ret != SD_RES_SUCCESS) {
			ret = -1;
			goto out;
		}
	} else {
		eprintf("failed, res: %"PRIx32"\n", rsp->result);
		ret = rsp->result;
		goto out;
	}
done:
	dprintf("recovered oid %"PRIx64" from %d to epoch %d\n",
		oid, tgt_epoch, epoch);
out:
	if (ret == SD_RES_SUCCESS)
		objlist_cache_insert(oid);
	/* free(NULL) is a no-op, safe even if valloc failed */
	free(buf);
	return ret;
}
/* * Try our best to read one copy and read local first. * * Return success if any read succeed. We don't call gateway_forward_request() * because we only read once. */ int gateway_read_obj(struct request *req) { int i, ret = SD_RES_SUCCESS; struct sd_req fwd_hdr; struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr; const struct sd_vnode *v; const struct sd_vnode *my_v; const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; const struct sd_vnode *my_obj_vnodes[SD_MAX_COPIES]; uint64_t oid = req->rq.obj.oid; int nr_copies, j; //PRINT_TO_LOG("WYH\n"); if (sys->enable_object_cache && !req->local && !bypass_object_cache(req)) { ret = object_cache_handle_request(req); goto out; } nr_copies = get_req_copy_number(req); if (nr_copies == 0) { sd_debug("there is no living nodes"); return SD_RES_HALT; } oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid, nr_copies, obj_vnodes); //PRINT_TO_LOG("out:%s\n", addr_to_str(obj_vnodes[0]->nid.addr,obj_vnodes[0]->nid.port)); oid_to_vnodes(req->vinfo->my_vnodes, req->vinfo->my_nr_vnodes, oid, nr_copies, my_obj_vnodes); my_v = my_obj_vnodes[0]; //PRINT_TO_LOG("out:%s\n", addr_to_str(my_v->nid.addr,my_v->nid.port)); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[0]; my_v = my_obj_vnodes[0]; //PRINT_TO_LOG("phy:%s\n", addr_to_str(v->nid.addr,v->nid.port)); //PRINT_TO_LOG("vir:%s\n", addr_to_str(my_v->nid.addr, my_v->nid.port)); //PRINT_TO_LOG("%d,%d\n", req->vinfo->nr_vnodes, req->vinfo->my_nr_vnodes); if (!vnode_is_local(v)) continue; ret = peer_read_obj(req); if (ret == SD_RES_SUCCESS) goto out; sd_err("local read %"PRIx64" failed, %s", oid, sd_strerror(ret)); break; } /* * Read random copy from cluster for better load balance, useful for * reading base VM's COW objects */ j = random(); for (i = 0; i < nr_copies; i++) { int idx = (i + j) % nr_copies; v = obj_vnodes[idx]; if (vnode_is_local(v)) continue; /* * We need to re-init it because rsp and req share the same * structure. 
*/ gateway_init_fwd_hdr(&fwd_hdr, &req->rq); ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data); if (ret != SD_RES_SUCCESS) continue; /* Read success */ memcpy(&req->rp, rsp, sizeof(*rsp)); break; } out: if (ret == SD_RES_SUCCESS && req->rq.proto_ver < SD_PROTO_VER_TRIM_ZERO_SECTORS) { /* the client doesn't support trimming zero bytes */ untrim_zero_blocks(req->data, req->rp.obj.offset, req->rp.data_length, req->rq.data_length); req->rp.data_length = req->rq.data_length; req->rp.obj.offset = 0; } return ret; }