Пример #1
0
/*
 * Decide whether the object @oid is stale on this node, using the vnode
 * info obtained from get_vnode_info().
 *
 * Returns false when the object's copy number is zero or when at least one
 * of the vnodes selected for the object is local; returns true otherwise.
 * The vnode info is handed back through put_vnode_info() on every path.
 */
static bool oid_stale(uint64_t oid)
{
	struct sd_vnode *targets[SD_MAX_COPIES];
	struct vnode_info *cur = get_vnode_info();
	int copies = get_obj_copy_number(oid, cur->nr_zones);
	/* An object with no copy number is never reported as stale. */
	bool stale = copies != 0;
	int idx;

	if (stale) {
		oid_to_vnodes(cur->vnodes, cur->nr_vnodes, oid, copies,
			      targets);
		/* The scan ends at the first local vnode. */
		for (idx = 0; idx < copies && stale; idx++)
			stale = !vnode_is_local(targets[idx]);
	}

	put_vnode_info(cur);
	return stale;
}
Пример #2
0
/*
 * Screen out objects that don't belong to this node.
 *
 * Every entry of @oids that has a local vnode among its target vnodes and
 * is not found among the rw->oids entries present when the call started is
 * appended to rw->oids. The whole list is sorted with obj_cmp afterwards.
 */
static void screen_object_list(struct recovery_work *rw,
			       uint64_t *oids, int nr_oids)
{
	struct sd_vnode *targets[SD_MAX_COPIES];
	/*
	 * The duplicate search covers only the entries present on entry; the
	 * entries appended below stay unsorted until the final qsort().
	 */
	int nr_known = rw->count;
	int copies = get_nr_copies(rw->cur_vnodes);
	int n, c;

	for (n = 0; n < nr_oids; n++) {
		oid_to_vnodes(rw->cur_vnodes, oids[n], copies, targets);

		for (c = 0; c < copies; c++) {
			if (vnode_is_local(targets[c]) &&
			    !bsearch(&oids[n], rw->oids, nr_known,
				     sizeof(uint64_t), obj_cmp)) {
				rw->oids[rw->count] = oids[n];
				rw->count++;
				break;
			}
		}
	}

	qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
}
Пример #3
0
/*
 * Try our best to read one copy and read local first.
 *
 * When the object cache is enabled and applies to this request, the request
 * is handed to object_cache_handle_request() and its result is returned.
 * Otherwise a local copy is tried first via peer_read_obj(); if there is no
 * local copy or the local read fails, the remaining copies are tried one by
 * one, starting at a random position.
 *
 * Return success if any read succeed, otherwise the status of the last
 * attempted read. We don't call gateway_forward_request() because we only
 * read once.
 */
int gateway_read_obj(struct request *req)
{
	int i, ret = SD_RES_SUCCESS;
	unsigned wlen, rlen;
	struct sd_req fwd_hdr;
	/* The response is written over the forwarded request header. */
	struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr;
	struct sd_vnode *v;
	struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
	uint64_t oid = req->rq.obj.oid;
	int nr_copies, start;

	if (is_object_cache_enabled() && !req->local && !bypass_object_cache(req))
		return object_cache_handle_request(req);

	/*
	 * NOTE(review): if nr_copies is 0 no read is attempted and the initial
	 * SD_RES_SUCCESS is returned - confirm callers can not get here with
	 * an empty copy set.
	 */
	nr_copies = get_req_copy_number(req);
	oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
		      nr_copies, obj_vnodes);
	for (i = 0; i < nr_copies; i++) {
		v = obj_vnodes[i];
		if (!vnode_is_local(v))
			continue;
		ret = peer_read_obj(req);
		if (ret == SD_RES_SUCCESS)
			return ret;

		eprintf("local read fail %x\n", ret);
		/* Only one local attempt is made. */
		break;
	}

	/*
	 * Read random copy from cluster for better load balance, useful for
	 * reading base VM's COW objects.
	 *
	 * The random value is reduced modulo nr_copies up front: random() can
	 * return values up to INT_MAX, and adding the loop index to such a
	 * value would overflow a signed int and could produce a negative
	 * array index.
	 */
	start = nr_copies > 0 ? (int)(random() % nr_copies) : 0;
	for (i = 0; i < nr_copies; i++) {
		int idx = (i + start) % nr_copies;

		v = obj_vnodes[idx];
		if (vnode_is_local(v))
			continue;
		/*
		 * We need to re-init it because rsp and req share the same
		 * structure.
		 */
		gateway_init_fwd_hdr(&fwd_hdr, &req->rq);
		wlen = 0;
		rlen = fwd_hdr.data_length;
		ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data, &wlen,
				     &rlen);
		if (ret != SD_RES_SUCCESS)
			continue;

		/* Read success */
		memcpy(&req->rp, rsp, sizeof(*rsp));
		break;
	}
	return ret;
}
Пример #4
0
/*
 * Screen out objects that don't belong to this node.
 *
 * For each entry of @oids the copy number is looked up, retrying once per
 * second for as long as the lookup yields zero. An object is appended to
 * rw->oids when one of its target vnodes is local and it is not found among
 * the rw->oids entries present when the call started. The whole list is
 * sorted with obj_cmp afterwards.
 */
static void screen_object_list(struct recovery_work *rw,
			       uint64_t *oids, int nr_oids)
{
	struct sd_vnode *targets[SD_MAX_COPIES];
	/* The duplicate search covers only the entries present on entry. */
	int nr_known = rw->count;
	int copies;
	int n, c;

	for (n = 0; n < nr_oids; n++) {
		for (;;) {
			copies = get_obj_copy_number(oids[n],
						     rw->cur_vinfo->nr_zones);
			if (copies)
				break;

			dprintf("can not find copy number for object %" PRIx64
				"\n", oids[n]);
			dprintf("probably, vdi was created but "
				"post_cluster_new_vdi() is not called yet\n");
			/* FIXME: can we wait for post_cluster_new_vdi
			 *        with a better way? */
			sleep(1);
		}

		oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes,
			      oids[n], copies, targets);

		for (c = 0; c < copies; c++) {
			if (vnode_is_local(targets[c]) &&
			    !bsearch(&oids[n], rw->oids, nr_known,
				     sizeof(uint64_t), obj_cmp)) {
				rw->oids[rw->count] = oids[n];
				rw->count++;
				break;
			}
		}
	}

	qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
}
Пример #5
0
/*
 * Decide whether the object @oid is stale on this node under @vinfo.
 *
 * Replicated object: as soon as one replica belongs to this node the object
 * is not stale.
 *
 * Erasure-coded object: every copy is unique, so when the hash ring changes
 * and the copy moves to another node (its index changes, even if this node
 * still owns some other copy) the object is stale.
 *
 * An @ec_index below SD_MAX_COPIES selects the erasure-coded rule; any
 * other value selects the replicated rule. Returns true when no target
 * vnode is local.
 */
static bool oid_stale(uint64_t oid, int ec_index, struct vnode_info *vinfo)
{
	const struct sd_vnode *targets[SD_MAX_COPIES];
	uint32_t copies = get_obj_copy_number(oid, vinfo->nr_zones);
	uint32_t pos;

	oid_to_vnodes(oid, &vinfo->vroot, copies, targets);

	for (pos = 0; pos < copies; pos++) {
		if (!vnode_is_local(targets[pos]))
			continue;
		/*
		 * Only the first local vnode decides the outcome: stale only
		 * for an erasure-coded copy whose position differs from
		 * ec_index.
		 */
		return ec_index < SD_MAX_COPIES && pos != ec_index;
	}

	return true;
}
Пример #6
0
/*
 * Screen out objects that don't belong to this node.
 *
 * For each of the @nr_oids entries of @oids the copy number is looked up;
 * objects whose copy number is zero are logged and skipped. An object is
 * appended to rw->oids when one of its target vnodes is local and it is not
 * found among the rw->oids entries present when the call started. The list
 * buffer is doubled through xrealloc() whenever it becomes full, and the
 * whole list is sorted with obj_cmp before returning.
 */
static void screen_object_list(struct recovery_work *rw,
			       uint64_t *oids, size_t nr_oids)
{
	const struct sd_vnode *vnodes[SD_MAX_COPIES];
	/* The duplicate search covers only the entries present on entry. */
	int old_count = rw->count;
	int nr_objs;
	/* Same type as nr_oids: no signed/unsigned comparison or overflow. */
	size_t i;
	int j;

	for (i = 0; i < nr_oids; i++) {
		nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones);
		if (!nr_objs) {
			sd_eprintf("ERROR: can not find copy number for object"
				   " %" PRIx64, oids[i]);
			continue;
		}
		oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes,
			      oids[i], nr_objs, vnodes);
		for (j = 0; j < nr_objs; j++) {
			if (!vnode_is_local(vnodes[j]))
				continue;
			/*
			 * The lookup does not depend on j, so once the object
			 * is known there is no point in checking the
			 * remaining vnodes.
			 */
			if (bsearch(&oids[i], rw->oids, old_count,
				    sizeof(uint64_t), obj_cmp))
				break;

			rw->oids[rw->count++] = oids[i];
			/* enlarge the list buffer if full */
			if (rw->count == list_buffer_size / sizeof(uint64_t)) {
				list_buffer_size *= 2;
				rw->oids = xrealloc(rw->oids, list_buffer_size);
			}
			break;
		}
	}

	qsort(rw->oids, rw->count, sizeof(uint64_t), obj_cmp);
}
Пример #7
0
/*
 * Try our best to read one copy and read local first.
 *
 * When the object cache is enabled and applies to this request, the request
 * is served by object_cache_handle_request(). Otherwise a local copy is
 * tried first via peer_read_obj(); if there is no local copy or the local
 * read fails, the remaining copies are tried one by one, starting at a
 * random position.
 *
 * Return success if any read succeed, SD_RES_HALT when the request has no
 * copies, otherwise the status of the last attempted read. We don't call
 * gateway_forward_request() because we only read once.
 *
 * On success, for clients older than SD_PROTO_VER_TRIM_ZERO_SECTORS, the
 * data is expanded with untrim_zero_blocks() and the response is reset to
 * the requested length at offset 0.
 */
int gateway_read_obj(struct request *req)
{
	int i, ret = SD_RES_SUCCESS;
	struct sd_req fwd_hdr;
	/* The response is written over the forwarded request header. */
	struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr;
	const struct sd_vnode *v;
	const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
	uint64_t oid = req->rq.obj.oid;
	int nr_copies, j;

	if (sys->enable_object_cache && !req->local &&
	    !bypass_object_cache(req)) {
		ret = object_cache_handle_request(req);
		goto out;
	}

	nr_copies = get_req_copy_number(req);

	if (nr_copies == 0) {
		sd_debug("there is no living nodes");
		return SD_RES_HALT;
	}

	oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
		      nr_copies, obj_vnodes);

	for (i = 0; i < nr_copies; i++) {
		/* Look at every copy, not just the first, for a local one. */
		v = obj_vnodes[i];
		if (!vnode_is_local(v))
			continue;
		ret = peer_read_obj(req);
		if (ret == SD_RES_SUCCESS)
			goto out;

		sd_err("local read %"PRIx64" failed, %s", oid,
		       sd_strerror(ret));
		/* Only one local attempt is made. */
		break;
	}

	/*
	 * Read random copy from cluster for better load balance, useful for
	 * reading base VM's COW objects.
	 *
	 * The random value is reduced modulo nr_copies up front: random() can
	 * return values up to INT_MAX, and adding the loop index to such a
	 * value would overflow a signed int and could produce a negative
	 * array index.
	 */
	j = (int)(random() % nr_copies);
	for (i = 0; i < nr_copies; i++) {
		int idx = (i + j) % nr_copies;

		v = obj_vnodes[idx];
		if (vnode_is_local(v))
			continue;
		/*
		 * We need to re-init it because rsp and req share the same
		 * structure.
		 */
		gateway_init_fwd_hdr(&fwd_hdr, &req->rq);
		ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data);
		if (ret != SD_RES_SUCCESS)
			continue;

		/* Read success */
		memcpy(&req->rp, rsp, sizeof(*rsp));
		break;
	}
out:
	if (ret == SD_RES_SUCCESS &&
	    req->rq.proto_ver < SD_PROTO_VER_TRIM_ZERO_SECTORS) {
		/* the client doesn't support trimming zero bytes */
		untrim_zero_blocks(req->data, req->rp.obj.offset,
				   req->rp.data_length, req->rq.data_length);
		req->rp.data_length = req->rq.data_length;
		req->rp.obj.offset = 0;
	}
	return ret;
}