static int get_vdis_from(struct sd_node *node) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct vdi_copy *vc = NULL; int i, ret = SD_RES_SUCCESS; unsigned int rlen; int count; if (node_is_local(node)) goto out; rlen = SD_DATA_OBJ_SIZE; /* FIXME */ vc = zalloc(rlen); if (!vc) { sd_printf(SDOG_ERR, "unable to allocate memory\n"); ret = SD_RES_NO_MEM; goto out; } sd_init_req(&hdr, SD_OP_GET_VDI_COPIES); hdr.data_length = rlen; ret = sheep_exec_req(&node->nid, &hdr, (char *)vc); if (ret != SD_RES_SUCCESS) goto out; count = rsp->data_length / sizeof(*vc); for (i = 0; i < count; i++) { set_bit(vc[i].vid, sys->vdi_inuse); add_vdi_copy_number(vc[i].vid, vc[i].nr_copies); } out: free(vc); return ret; }
/* * Try our best to read one copy and read local first. * * Return success if any read succeed. We don't call gateway_forward_request() * because we only read once. */ int gateway_read_obj(struct request *req) { int i, ret = SD_RES_SUCCESS; unsigned wlen, rlen; struct sd_req fwd_hdr; struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr; struct sd_vnode *v; struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; uint64_t oid = req->rq.obj.oid; int nr_copies, j; if (is_object_cache_enabled() && !req->local && !bypass_object_cache(req)) return object_cache_handle_request(req); nr_copies = get_req_copy_number(req); oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[i]; if (!vnode_is_local(v)) continue; ret = peer_read_obj(req); if (ret == SD_RES_SUCCESS) return ret; eprintf("local read fail %x\n", ret); break; } /* * Read random copy from cluster for better load balance, useful for * reading base VM's COW objects */ j = random(); for (i = 0; i < nr_copies; i++) { int idx = (i + j) % nr_copies; v = obj_vnodes[idx]; if (vnode_is_local(v)) continue; /* * We need to re-init it because rsp and req share the same * structure. */ gateway_init_fwd_hdr(&fwd_hdr, &req->rq); wlen = 0; rlen = fwd_hdr.data_length; ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data, &wlen, &rlen); if (ret != SD_RES_SUCCESS) continue; /* Read success */ memcpy(&req->rp, rsp, sizeof(*rsp)); break; } return ret; }
static int recover_object_from_replica(uint64_t oid, const struct sd_vnode *vnode, uint32_t epoch, uint32_t tgt_epoch) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; unsigned rlen; int ret = SD_RES_NO_MEM; void *buf = NULL; struct siocb iocb = { 0 }; if (vnode_is_local(vnode)) { ret = sd_store->link(oid, tgt_epoch); goto out; } rlen = get_objsize(oid); buf = valloc(rlen); if (!buf) { sd_eprintf("%m"); goto out; } sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; hdr.data_length = rlen; hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; ret = sheep_exec_req(&vnode->nid, &hdr, buf); if (ret != SD_RES_SUCCESS) goto out; iocb.epoch = epoch; iocb.length = rsp->data_length; iocb.offset = rsp->obj.offset; iocb.buf = buf; ret = sd_store->create_and_write(oid, &iocb); out: if (ret == SD_RES_SUCCESS) { sd_dprintf("recovered oid %"PRIx64" from %d to epoch %d", oid, tgt_epoch, epoch); objlist_cache_insert(oid); } free(buf); return ret; }
/* Fetch the object list from all the nodes in the cluster */ static uint64_t *fetch_object_list(struct sd_node *e, uint32_t epoch, size_t *nr_oids) { char name[128]; struct sd_list_req hdr; struct sd_list_rsp *rsp = (struct sd_list_rsp *)&hdr; size_t buf_size = list_buffer_size; uint64_t *buf = xmalloc(buf_size); int ret; addr_to_str(name, sizeof(name), e->nid.addr, 0); sd_dprintf("%s %"PRIu32, name, e->nid.port); retry: sd_init_req((struct sd_req *)&hdr, SD_OP_GET_OBJ_LIST); hdr.tgt_epoch = epoch - 1; hdr.data_length = buf_size; hdr.epoch = sys_epoch(); ret = sheep_exec_req(&e->nid, (struct sd_req *)&hdr, buf); switch (ret) { case SD_RES_SUCCESS: break; case SD_RES_BUFFER_SMALL: buf_size *= 2; buf = xrealloc(buf, buf_size); goto retry; default: free(buf); return NULL; } *nr_oids = rsp->data_length / sizeof(uint64_t); sd_dprintf("%zu", *nr_oids); return buf; }
/* * Try our best to read one copy and read local first. * * Return success if any read succeed. We don't call gateway_forward_request() * because we only read once. */ int gateway_read_obj(struct request *req) { int i, ret = SD_RES_SUCCESS; struct sd_req fwd_hdr; struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr; const struct sd_vnode *v; const struct sd_vnode *my_v; const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; const struct sd_vnode *my_obj_vnodes[SD_MAX_COPIES]; uint64_t oid = req->rq.obj.oid; int nr_copies, j; //PRINT_TO_LOG("WYH\n"); if (sys->enable_object_cache && !req->local && !bypass_object_cache(req)) { ret = object_cache_handle_request(req); goto out; } nr_copies = get_req_copy_number(req); if (nr_copies == 0) { sd_debug("there is no living nodes"); return SD_RES_HALT; } oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid, nr_copies, obj_vnodes); //PRINT_TO_LOG("out:%s\n", addr_to_str(obj_vnodes[0]->nid.addr,obj_vnodes[0]->nid.port)); oid_to_vnodes(req->vinfo->my_vnodes, req->vinfo->my_nr_vnodes, oid, nr_copies, my_obj_vnodes); my_v = my_obj_vnodes[0]; //PRINT_TO_LOG("out:%s\n", addr_to_str(my_v->nid.addr,my_v->nid.port)); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[0]; my_v = my_obj_vnodes[0]; //PRINT_TO_LOG("phy:%s\n", addr_to_str(v->nid.addr,v->nid.port)); //PRINT_TO_LOG("vir:%s\n", addr_to_str(my_v->nid.addr, my_v->nid.port)); //PRINT_TO_LOG("%d,%d\n", req->vinfo->nr_vnodes, req->vinfo->my_nr_vnodes); if (!vnode_is_local(v)) continue; ret = peer_read_obj(req); if (ret == SD_RES_SUCCESS) goto out; sd_err("local read %"PRIx64" failed, %s", oid, sd_strerror(ret)); break; } /* * Read random copy from cluster for better load balance, useful for * reading base VM's COW objects */ j = random(); for (i = 0; i < nr_copies; i++) { int idx = (i + j) % nr_copies; v = obj_vnodes[idx]; if (vnode_is_local(v)) continue; /* * We need to re-init it because rsp and req share the same * structure. */ gateway_init_fwd_hdr(&fwd_hdr, &req->rq); ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data); if (ret != SD_RES_SUCCESS) continue; /* Read success */ memcpy(&req->rp, rsp, sizeof(*rsp)); break; } out: if (ret == SD_RES_SUCCESS && req->rq.proto_ver < SD_PROTO_VER_TRIM_ZERO_SECTORS) { /* the client doesn't support trimming zero bytes */ untrim_zero_blocks(req->data, req->rp.obj.offset, req->rp.data_length, req->rq.data_length); req->rp.data_length = req->rq.data_length; req->rp.obj.offset = 0; } return ret; }