/*
 * Decide whether an object is stale on this node.
 *
 * The object is NOT stale when at least one of its replica vnodes maps
 * onto the local node, or when its copy number cannot be resolved at
 * all (no replicas to compare against).  Otherwise it is stale.
 */
static bool oid_stale(uint64_t oid)
{
	struct vnode_info *vinfo = get_vnode_info();
	struct sd_vnode *replicas[SD_MAX_COPIES];
	bool stale = true;
	int idx, copies;

	copies = get_obj_copy_number(oid, vinfo->nr_zones);
	if (!copies) {
		stale = false;
		goto out;
	}

	oid_to_vnodes(vinfo->vnodes, vinfo->nr_vnodes, oid, copies, replicas);

	for (idx = 0; idx < copies; idx++) {
		if (vnode_is_local(replicas[idx])) {
			stale = false;
			break;
		}
	}
out:
	put_vnode_info(vinfo);
	return stale;
}
/* Screen out objects that don't belong to this node */ static void screen_object_list(struct recovery_work *rw, uint64_t *oids, int nr_oids) { struct sd_vnode *vnodes[SD_MAX_COPIES]; int old_count = rw->count; int nr_objs; int i, j; nr_objs = get_nr_copies(rw->cur_vnodes); for (i = 0; i < nr_oids; i++) { oid_to_vnodes(rw->cur_vnodes, oids[i], nr_objs, vnodes); for (j = 0; j < nr_objs; j++) { if (!vnode_is_local(vnodes[j])) continue; if (bsearch(&oids[i], rw->oids, old_count, sizeof(uint64_t), obj_cmp)) continue; rw->oids[rw->count++] = oids[i]; break; } } qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp); }
/* * Try our best to read one copy and read local first. * * Return success if any read succeed. We don't call gateway_forward_request() * because we only read once. */ int gateway_read_obj(struct request *req) { int i, ret = SD_RES_SUCCESS; unsigned wlen, rlen; struct sd_req fwd_hdr; struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr; struct sd_vnode *v; struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; uint64_t oid = req->rq.obj.oid; int nr_copies, j; if (is_object_cache_enabled() && !req->local && !bypass_object_cache(req)) return object_cache_handle_request(req); nr_copies = get_req_copy_number(req); oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[i]; if (!vnode_is_local(v)) continue; ret = peer_read_obj(req); if (ret == SD_RES_SUCCESS) return ret; eprintf("local read fail %x\n", ret); break; } /* * Read random copy from cluster for better load balance, useful for * reading base VM's COW objects */ j = random(); for (i = 0; i < nr_copies; i++) { int idx = (i + j) % nr_copies; v = obj_vnodes[idx]; if (vnode_is_local(v)) continue; /* * We need to re-init it because rsp and req share the same * structure. */ gateway_init_fwd_hdr(&fwd_hdr, &req->rq); wlen = 0; rlen = fwd_hdr.data_length; ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data, &wlen, &rlen); if (ret != SD_RES_SUCCESS) continue; /* Read success */ memcpy(&req->rp, rsp, sizeof(*rsp)); break; } return ret; }
/* Screen out objects that don't belong to this node */ static void screen_object_list(struct recovery_work *rw, uint64_t *oids, int nr_oids) { struct sd_vnode *vnodes[SD_MAX_COPIES]; int old_count = rw->count; int nr_objs; int i, j; for (i = 0; i < nr_oids; i++) { again: nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones); if (!nr_objs) { dprintf("can not find copy number for object %" PRIx64 "\n", oids[i]); dprintf("probably, vdi was created but " "post_cluster_new_vdi() is not called yet\n"); /* FIXME: can we wait for post_cluster_new_vdi * with a better way? */ sleep(1); goto again; } oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes, oids[i], nr_objs, vnodes); for (j = 0; j < nr_objs; j++) { if (!vnode_is_local(vnodes[j])) continue; if (bsearch(&oids[i], rw->oids, old_count, sizeof(uint64_t), obj_cmp)) continue; rw->oids[rw->count++] = oids[i]; break; } } qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp); }
/*
 * For replicated object, if any of the replica belongs to this node, we
 * consider it not stale.
 *
 * For erasured object, since every copy is unique and if it migrates to other
 * node(index gets changed even it has some other copy belongs to it) because
 * of hash ring changes, we consider it stale.
 */
static bool oid_stale(uint64_t oid, int ec_index, struct vnode_info *vinfo)
{
	const struct sd_vnode *replicas[SD_MAX_COPIES];
	uint32_t idx, copies;

	copies = get_obj_copy_number(oid, vinfo->nr_zones);
	oid_to_vnodes(oid, &vinfo->vroot, copies, replicas);

	for (idx = 0; idx < copies; idx++) {
		if (!vnode_is_local(replicas[idx]))
			continue;
		/* ec_index < SD_MAX_COPIES marks an erasure-coded object:
		 * only the exact index being local makes it fresh */
		if (ec_index < SD_MAX_COPIES)
			return idx != (uint32_t)ec_index;
		/* replicated: any local copy means not stale */
		return false;
	}
	return true;
}
/* Screen out objects that don't belong to this node */ static void screen_object_list(struct recovery_work *rw, uint64_t *oids, size_t nr_oids) { const struct sd_vnode *vnodes[SD_MAX_COPIES]; int old_count = rw->count; int nr_objs; int i, j; for (i = 0; i < nr_oids; i++) { nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones); if (!nr_objs) { sd_eprintf("ERROR: can not find copy number for object" " %" PRIx64, oids[i]); continue; } oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes, oids[i], nr_objs, vnodes); for (j = 0; j < nr_objs; j++) { if (!vnode_is_local(vnodes[j])) continue; if (bsearch(&oids[i], rw->oids, old_count, sizeof(uint64_t), obj_cmp)) continue; rw->oids[rw->count++] = oids[i]; /* enlarge the list buffer if full */ if (rw->count == list_buffer_size / sizeof(uint64_t)) { list_buffer_size *= 2; rw->oids = xrealloc(rw->oids, list_buffer_size); } break; } } qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp); }
/* * Try our best to read one copy and read local first. * * Return success if any read succeed. We don't call gateway_forward_request() * because we only read once. */ int gateway_read_obj(struct request *req) { int i, ret = SD_RES_SUCCESS; struct sd_req fwd_hdr; struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr; const struct sd_vnode *v; const struct sd_vnode *my_v; const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; const struct sd_vnode *my_obj_vnodes[SD_MAX_COPIES]; uint64_t oid = req->rq.obj.oid; int nr_copies, j; //PRINT_TO_LOG("WYH\n"); if (sys->enable_object_cache && !req->local && !bypass_object_cache(req)) { ret = object_cache_handle_request(req); goto out; } nr_copies = get_req_copy_number(req); if (nr_copies == 0) { sd_debug("there is no living nodes"); return SD_RES_HALT; } oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid, nr_copies, obj_vnodes); //PRINT_TO_LOG("out:%s\n", addr_to_str(obj_vnodes[0]->nid.addr,obj_vnodes[0]->nid.port)); oid_to_vnodes(req->vinfo->my_vnodes, req->vinfo->my_nr_vnodes, oid, nr_copies, my_obj_vnodes); my_v = my_obj_vnodes[0]; //PRINT_TO_LOG("out:%s\n", addr_to_str(my_v->nid.addr,my_v->nid.port)); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[0]; my_v = my_obj_vnodes[0]; //PRINT_TO_LOG("phy:%s\n", addr_to_str(v->nid.addr,v->nid.port)); //PRINT_TO_LOG("vir:%s\n", addr_to_str(my_v->nid.addr, my_v->nid.port)); //PRINT_TO_LOG("%d,%d\n", req->vinfo->nr_vnodes, req->vinfo->my_nr_vnodes); if (!vnode_is_local(v)) continue; ret = peer_read_obj(req); if (ret == SD_RES_SUCCESS) goto out; sd_err("local read %"PRIx64" failed, %s", oid, sd_strerror(ret)); break; } /* * Read random copy from cluster for better load balance, useful for * reading base VM's COW objects */ j = random(); for (i = 0; i < nr_copies; i++) { int idx = (i + j) % nr_copies; v = obj_vnodes[idx]; if (vnode_is_local(v)) continue; /* * We need to re-init it because rsp and req share the same * structure. 
*/ gateway_init_fwd_hdr(&fwd_hdr, &req->rq); ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data); if (ret != SD_RES_SUCCESS) continue; /* Read success */ memcpy(&req->rp, rsp, sizeof(*rsp)); break; } out: if (ret == SD_RES_SUCCESS && req->rq.proto_ver < SD_PROTO_VER_TRIM_ZERO_SECTORS) { /* the client doesn't support trimming zero bytes */ untrim_zero_blocks(req->data, req->rp.obj.offset, req->rp.data_length, req->rq.data_length); req->rp.data_length = req->rq.data_length; req->rp.obj.offset = 0; } return ret; }