示例#1
0
void
as_record_drop_stats(as_record* r, as_namespace* ns)
{
	as_namespace_release_set_id(ns, as_index_get_set_id(r));

	cf_atomic64_decr(&ns->n_objects);
}
int
delete_replica(as_partition_reservation* rsv, cf_digest* keyd, bool is_subrec,
		bool is_nsup_delete, bool is_xdr_op, cf_node master)
{
	// Shortcut pointers & flags.
	as_namespace* ns = rsv->ns;
	as_index_tree* tree = is_subrec ? rsv->sub_tree : rsv->tree;

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	if (as_record_get(tree, keyd, &r_ref, ns) != 0) {
		return AS_PROTO_RESULT_FAIL_NOTFOUND;
	}

	as_record* r = r_ref.r;

	if (ns->storage_data_in_memory) {
		as_storage_rd rd;
		as_storage_record_open(ns, r, &rd, keyd);
		delete_adjust_sindex(&rd);
		as_storage_record_close(r, &rd);
	}

	// Save the set-ID and generation for XDR.
	uint16_t set_id = as_index_get_set_id(r);
	uint16_t generation = r->generation;

	as_index_delete(tree, keyd);
	as_record_done(&r_ref, ns);

	if (xdr_must_ship_delete(ns, is_nsup_delete, is_xdr_op)) {
		xdr_write(ns, *keyd, generation, master, true, set_id, NULL);
	}

	return AS_PROTO_RESULT_OK;
}
示例#3
0
int as_msg_make_response_bufbuilder(as_record *r, as_storage_rd *rd,
		cf_buf_builder **bb_r, bool nobindata, char *nsname, bool use_sets,
		bool include_key, cf_vector *binlist)
{
	// Sanity checks. Either rd should be there or nobindata and nsname should be present.
	if (!(rd || (nobindata && nsname))) {
		cf_detail(AS_PROTO, "Neither storage record nor nobindata is set. Skipping the record.");
		return 0;
	}

	// figure out the size of the entire buffer
	int         set_name_len = 0;
	const char *set_name     = NULL;
	int         ns_len       = rd ? strlen(rd->ns->name) : strlen(nsname);

	if (use_sets && as_index_get_set_id(r) != INVALID_SET_ID) {
		as_namespace *ns = NULL;

		if (rd) {
			ns = rd->ns;
		} else if (nsname) {
			ns = as_namespace_get_byname(nsname);
		}
		if (!ns) {
			cf_info(AS_PROTO, "Cannot get namespace, needed to get set information. Skipping record.");
			return -1;
		}
		set_name = as_index_get_set_name(r, ns);
		if (set_name) {
			set_name_len = strlen(set_name);
		}
	}

	uint8_t* key = NULL;
	uint32_t key_size = 0;

	if (include_key && as_index_is_flag_set(r, AS_INDEX_FLAG_KEY_STORED)) {
		if (! as_storage_record_get_key(rd)) {
			cf_info(AS_PROTO, "can't get key - skipping record");
			return -1;
		}

		key = rd->key;
		key_size = rd->key_size;
	}

	uint16_t n_fields = 2;
	int msg_sz = sizeof(as_msg);
	msg_sz += sizeof(as_msg_field) + sizeof(cf_digest);
	msg_sz += sizeof(as_msg_field) + ns_len;
	if (set_name) {
		n_fields++;
		msg_sz += sizeof(as_msg_field) + set_name_len;
	}
	if (key) {
		n_fields++;
		msg_sz += sizeof(as_msg_field) + key_size;
	}

	int list_bins   = 0;
	int in_use_bins = 0;
	if (rd) {
		in_use_bins = as_bin_inuse_count(rd);
	}

	if (nobindata == false) {
		if(binlist) {
			int binlist_sz = cf_vector_size(binlist);
			for(uint16_t i = 0; i < binlist_sz; i++) {
				char binname[AS_ID_BIN_SZ];
				cf_vector_get(binlist, i, (void*)&binname);
				cf_debug(AS_PROTO, " Binname projected inside is |%s| \n", binname);
				as_bin *p_bin = as_bin_get (rd, (uint8_t*)binname, strlen(binname));
				if (!p_bin)
				{
					cf_debug(AS_PROTO, "To be projected bin |%s| not found \n", binname);
					continue;
				}
				cf_debug(AS_PROTO, "Adding bin |%s| to projected bins |%s| \n", binname);
				list_bins++;
				msg_sz += sizeof(as_msg_op);
				msg_sz += rd->ns->single_bin ? 0 : strlen(binname);
				uint32_t psz;
				if (as_bin_is_hidden(p_bin)) {
					psz = 0;
				} else {
					as_particle_tobuf(p_bin, 0, &psz); // get size
				}
				msg_sz += psz;
			}
		}
		else {
			msg_sz += sizeof(as_msg_op) * in_use_bins; // the bin headers
			for (uint16_t i = 0; i < in_use_bins; i++) {
				as_bin *p_bin = &rd->bins[i];
				msg_sz += rd->ns->single_bin ? 0 : strlen(as_bin_get_name_from_id(rd->ns, p_bin->id));
				uint32_t psz;
				if (as_bin_is_hidden(p_bin)) {
					psz = 0;
				} else {
					as_particle_tobuf(p_bin, 0, &psz); // get size
				}
				msg_sz += psz;
			}
		}
	}

	uint8_t *b;
	cf_buf_builder_reserve(bb_r, msg_sz, &b);

	// set up the header
	uint8_t *buf = b;
	as_msg *msgp = (as_msg *) buf;

	msgp->header_sz = sizeof(as_msg);
	msgp->info1 = (nobindata ? AS_MSG_INFO1_GET_NOBINDATA : 0);
	msgp->info2 = 0;
	msgp->info3 = 0;
	msgp->unused = 0;
	msgp->result_code = 0;
	msgp->generation = r->generation;
	msgp->record_ttl = r->void_time;
	msgp->transaction_ttl = 0;
	msgp->n_fields = n_fields;
	if (rd) {
		if (binlist)
			msgp->n_ops = list_bins;
		else
			msgp->n_ops = in_use_bins;
	} else {
		msgp->n_ops = 0;
	}
	as_msg_swap_header(msgp);

	buf += sizeof(as_msg);

	as_msg_field *mf = (as_msg_field *) buf;
	mf->field_sz = sizeof(cf_digest) + 1;
	mf->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE;
	if (rd) {
		memcpy(mf->data, &rd->keyd, sizeof(cf_digest));
	} else {
		memcpy(mf->data, &r->key, sizeof(cf_digest));
	}
	as_msg_swap_field(mf);
	buf += sizeof(as_msg_field) + sizeof(cf_digest);

	mf = (as_msg_field *) buf;
	mf->field_sz = ns_len + 1;
	mf->type = AS_MSG_FIELD_TYPE_NAMESPACE;
	if (rd) {
		memcpy(mf->data, rd->ns->name, ns_len);
	} else {
		memcpy(mf->data, nsname, ns_len);
	}
	as_msg_swap_field(mf);
	buf += sizeof(as_msg_field) + ns_len;

	if (set_name) {
		mf = (as_msg_field *) buf;
		mf->field_sz = set_name_len + 1;
		mf->type = AS_MSG_FIELD_TYPE_SET;
		memcpy(mf->data, set_name, set_name_len);
		as_msg_swap_field(mf);
		buf += sizeof(as_msg_field) + set_name_len;
	}

	if (key) {
		mf = (as_msg_field *) buf;
		mf->field_sz = key_size + 1;
		mf->type = AS_MSG_FIELD_TYPE_KEY;
		memcpy(mf->data, key, key_size);
		as_msg_swap_field(mf);
		buf += sizeof(as_msg_field) + key_size;
	}

	if (nobindata) {
		goto Out;
	}

	if(binlist) {
		int binlist_sz = cf_vector_size(binlist);
		for(uint16_t i = 0; i < binlist_sz; i++) {

			char binname[AS_ID_BIN_SZ];
			cf_vector_get(binlist, i, (void*)&binname);
			cf_debug(AS_PROTO, " Binname projected inside is |%s| \n", binname);
			as_bin *p_bin = as_bin_get (rd, (uint8_t*)binname, strlen(binname));
			if (!p_bin) // should it be checked before ???
				continue;

			as_msg_op *op = (as_msg_op *)buf;
			buf += sizeof(as_msg_op);

			op->op = AS_MSG_OP_READ;

			op->name_sz = as_bin_memcpy_name(rd->ns, op->name, p_bin);
			buf += op->name_sz;

			// Since there are two variable bits, the size is everything after
			// the data bytes - and this is only the head, we're patching up
			// the rest in a minute.
			op->op_sz = 4 + op->name_sz;

			if (as_bin_inuse(p_bin)) {
				op->particle_type = as_particle_type_convert(as_bin_get_particle_type(p_bin));
				op->version = as_bin_get_version(p_bin, rd->ns->single_bin);

				uint32_t psz = msg_sz - (buf - b); // size remaining in buffer, for safety
				if (as_bin_is_hidden(p_bin)) {
                	op->particle_type = AS_PARTICLE_TYPE_NULL;
					psz = 0;
				} else {
					if (0 != as_particle_tobuf(p_bin, buf, &psz)) {
						cf_warning(AS_PROTO, "particle to buf: could not copy data!");
					}
				}
				buf += psz;
				op->op_sz += psz;
			}
			else {
				cf_debug(AS_PROTO, "Whoops !! bin not in use");
				op->particle_type = AS_PARTICLE_TYPE_NULL;
			}
			as_msg_swap_op(op);
		}
	}
	else {
		// over all bins, copy into the buffer
		for (uint16_t i = 0; i < in_use_bins; i++) {

			as_msg_op *op = (as_msg_op *)buf;
			buf += sizeof(as_msg_op);

			op->op = AS_MSG_OP_READ;

			op->name_sz = as_bin_memcpy_name(rd->ns, op->name, &rd->bins[i]);
			buf += op->name_sz;

			// Since there are two variable bits, the size is everything after
			// the data bytes - and this is only the head, we're patching up
			// the rest in a minute.
			op->op_sz = 4 + op->name_sz;

			if (as_bin_inuse(&rd->bins[i])) {
				op->particle_type = as_particle_type_convert(as_bin_get_particle_type(&rd->bins[i]));
				op->version = as_bin_get_version(&rd->bins[i], rd->ns->single_bin);

				uint32_t psz = msg_sz - (buf - b); // size remaining in buffer, for safety
				if (as_bin_is_hidden(&rd->bins[i])) {
                	op->particle_type = AS_PARTICLE_TYPE_NULL;
					psz = 0;
				} else {
					if (0 != as_particle_tobuf(&rd->bins[i], buf, &psz)) {
						cf_warning(AS_PROTO, "particle to buf: could not copy data!");
					}
				}
				buf += psz;
				op->op_sz += psz;
			}
			else {
				op->particle_type = AS_PARTICLE_TYPE_NULL;
			}
			as_msg_swap_op(op);
		}
	}
Out:
	return(0);
}
示例#4
0
// NB: this uses the same logic as the bufbuild function
// as_msg_make_response_bufbuilder() but does not build a buffer and simply
// returns sizing information.  This is required for query runtime memory
// accounting.
// returns -1 in case of error
// otherwise returns resize value
size_t as_msg_response_msgsize(as_record *r, as_storage_rd *rd, bool nobindata,
		char *nsname, bool use_sets, cf_vector *binlist)
{

	// Sanity checks. Either rd should be there or nobindata and nsname should be present.
	if (!(rd || (nobindata && nsname))) {
		cf_detail(AS_PROTO, "Neither storage record nor nobindata is set. Skipping the record.");
		return -1;
	}

	// figure out the size of the entire buffer
	int         set_name_len = 0;
	const char *set_name     = NULL;
	int         ns_len       = rd ? strlen(rd->ns->name) : strlen(nsname);

	if (use_sets && as_index_get_set_id(r) != INVALID_SET_ID) {
		as_namespace *ns = NULL;

		if (rd) {
			ns = rd->ns;
		} else if (nsname) {
			ns = as_namespace_get_byname(nsname);
		}
		if (!ns) {
			cf_info(AS_PROTO, "Cannot get namespace, needed to get set information. Skipping record.");
			return -1;
		}
		set_name = as_index_get_set_name(r, ns);
		if (set_name) {
			set_name_len = strlen(set_name);
		}
	}

	int msg_sz = sizeof(as_msg);
	msg_sz += sizeof(as_msg_field) + sizeof(cf_digest);
	msg_sz += sizeof(as_msg_field) + ns_len;
	if (set_name) {
		msg_sz += sizeof(as_msg_field) + set_name_len;
	}

	int in_use_bins = as_bin_inuse_count(rd);
	int list_bins   = 0;

	if (nobindata == false) {
		if(binlist) {
			int binlist_sz = cf_vector_size(binlist);
			for(uint16_t i = 0; i < binlist_sz; i++) {
				char binname[AS_ID_BIN_SZ];
				cf_vector_get(binlist, i, (void*)&binname);
				cf_debug(AS_PROTO, " Binname projected inside is |%s| \n", binname);
				as_bin *p_bin = as_bin_get (rd, (uint8_t*)binname, strlen(binname));
				if (!p_bin)
				{
					cf_debug(AS_PROTO, "To be projected bin |%s| not found \n", binname);
					continue;
				}
				cf_debug(AS_PROTO, "Adding bin |%s| to projected bins |%s| \n", binname);
				list_bins++;
				msg_sz += sizeof(as_msg_op);
				msg_sz += rd->ns->single_bin ? 0 : strlen(binname);
				uint32_t psz;
				as_particle_tobuf(p_bin, 0, &psz); // get size
				msg_sz += psz;
			}
		}
		else {
			msg_sz += sizeof(as_msg_op) * in_use_bins; // the bin headers
			for (uint16_t i = 0; i < in_use_bins; i++) {
				as_bin *p_bin = &rd->bins[i];
				msg_sz += rd->ns->single_bin ? 0 : strlen(as_bin_get_name_from_id(rd->ns, p_bin->id));
				uint32_t psz;
				as_particle_tobuf(p_bin, 0, &psz); // get size
				msg_sz += psz;
			}
		}
	}
	return msg_sz;
}
示例#5
0
/* Internal Function: Does the post processing for the UDF record after the
 *					  UDF execution. Does the following:
 *		1. Record is closed
 *		2. urecord_op is updated to delete in case there is no bin left in it.
 *		3. record->pickled_buf is populated before the record is close in case
 *		   it was write operation
 *		4. UDF updates cache is cleared
 *
 *	Returns: Nothing
 *
 *	Parameters: urecord          - UDF record to operate on
 *				urecord_op (out) - Populated with the optype
 */
void
udf_rw_post_processing(udf_record *urecord, udf_optype *urecord_op, uint16_t set_id)
{
	as_storage_rd      *rd   = urecord->rd;
	as_transaction     *tr   = urecord->tr;
	as_index_ref    *r_ref   = urecord->r_ref;

	// INIT
	urecord->pickled_buf     = NULL;
	urecord->pickled_sz      = 0;
	urecord->pickled_void_time     = 0;
	as_rec_props_clear(&urecord->pickled_rec_props);
	bool udf_xdr_ship_op = false;

	// TODO: optimize not to allocate buffer if it is single
	// node cluster. No remote to send data to
	// Check if UDF has updates.
	if (urecord->flag & UDF_RECORD_FLAG_HAS_UPDATES) {
		// Check if the record is not deleted after an update
		if ( urecord->flag & UDF_RECORD_FLAG_OPEN) {
			*urecord_op = UDF_OPTYPE_WRITE;
			udf_xdr_ship_op = true;
		} 
		else {
			// If the record has updates and it is not open, 
			// and if it pre-existed it's an update followed by a delete.
			if ( urecord->flag & UDF_RECORD_FLAG_PREEXISTS) {
				*urecord_op = UDF_OPTYPE_DELETE;
				udf_xdr_ship_op = true;
			} 
			// If the record did not pre-exist and is updated
			// and it is not open, then it is create followed by
			// delete essentially no_op.
			else {
				*urecord_op = UDF_OPTYPE_NONE;
			}
		}
	} else if ((urecord->flag & UDF_RECORD_FLAG_PREEXISTS)
			   && !(urecord->flag & UDF_RECORD_FLAG_OPEN)) {
		*urecord_op  = UDF_OPTYPE_DELETE;
		udf_xdr_ship_op = true;
	} else {
		*urecord_op  = UDF_OPTYPE_READ;
	}

	cf_detail(AS_UDF, "FINISH working with LDT Record %p %p %p %p %d", &urecord,
			urecord->tr, urecord->r_ref, urecord->rd,
			(urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN));

	// If there exists a record reference but no bin of the record is in use,
	// delete the record. remove from the tree. Only LDT_RECORD here not needed
	// for LDT_SUBRECORD (only do it if requested by UDF). All the SUBRECORD of
	// removed LDT_RECORD will be lazily cleaned up by defrag.
	if (!(urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD)
			&& urecord->flag & UDF_RECORD_FLAG_OPEN
			&& !as_bin_inuse_has(rd)) {
		as_index_delete(tr->rsv.tree, &tr->keyd);
		urecord->starting_memory_bytes = 0;
		*urecord_op                    = UDF_OPTYPE_DELETE;
		udf_xdr_ship_op = true;
	} else if (*urecord_op == UDF_OPTYPE_WRITE)	{
		cf_detail(AS_UDF, "Committing Changes %"PRIx64" n_bins %d", rd->keyd, as_bin_get_n_bins(r_ref->r, rd));

		size_t  rec_props_data_size = as_storage_record_rec_props_size(rd);
		uint8_t rec_props_data[rec_props_data_size];
		if (rec_props_data_size > 0) {
			as_storage_record_set_rec_props(rd, rec_props_data);
		}

		write_local_post_processing(tr, tr->rsv.ns, NULL, &urecord->pickled_buf,
			&urecord->pickled_sz, &urecord->pickled_void_time,
			&urecord->pickled_rec_props, true/*increment_generation*/,
			NULL, r_ref->r, rd, urecord->starting_memory_bytes);

		// Now ok to accommodate a new stored key...
		if (! as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_allocate_key(r_ref->r, rd->key, rd->key_size);
			}

			as_index_set_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
		// ... or drop a stored key.
		else if (as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && ! rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_remove_key(r_ref->r);
			}

			as_index_clear_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
	}

	// Collect the record information (for XDR) before closing the record
	as_generation generation = 0;
	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
		generation = r_ref->r->generation;
		set_id = as_index_get_set_id(r_ref->r);
	}
	// Close the record for all the cases
	udf_record_close(urecord, false);

	// Write to XDR pipe after closing the record, in order to release the record lock as
	// early as possible.
	if (udf_xdr_ship_op == true) {
		if (UDF_OP_IS_WRITE(*urecord_op)) {
			cf_detail(AS_UDF, "UDF write shipping for key %" PRIx64, tr->keyd);
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, false, set_id);
		} else if (UDF_OP_IS_DELETE(*urecord_op)) {
			cf_detail(AS_UDF, "UDF delete shipping for key %" PRIx64, tr->keyd);
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, true, set_id);
		}
	}

	// Replication happens when the main record replicates
	if (urecord->particle_data) {
		cf_free(urecord->particle_data);
		urecord->particle_data = 0;
	}
	udf_record_cache_free(urecord);
}
示例#6
0
// If remote record is better than local record, replace local with remote.
int
as_record_replace_if_better(as_remote_record *rr, bool is_repl_write,
		bool skip_sindex, bool do_xdr_write)
{
	as_namespace *ns = rr->rsv->ns;

	if (! as_storage_has_space(ns)) {
		cf_warning(AS_RECORD, "{%s} record replace: drives full", ns->name);
		return AS_ERR_OUT_OF_SPACE;
	}

	CF_ALLOC_SET_NS_ARENA(ns);

	as_index_tree *tree = rr->rsv->tree;

	as_index_ref r_ref;
	int rv = as_record_get_create(tree, rr->keyd, &r_ref, ns);

	if (rv < 0) {
		return AS_ERR_OUT_OF_SPACE;
	}

	bool is_create = rv == 1;
	as_index *r = r_ref.r;

	int result;

	conflict_resolution_pol policy = ns->conflict_resolution_policy;

	if (is_repl_write) {
		bool from_replica;

		if ((result = as_partition_check_source(ns, rr->rsv->p, rr->src,
				&from_replica)) != AS_OK) {
			record_replace_failed(rr, &r_ref, NULL, is_create);
			return result;
		}

		repl_write_init_repl_state(rr, from_replica);
		policy = repl_write_conflict_resolution_policy(ns);
	}

	if (! is_create && record_replace_check(r, ns) < 0) {
		record_replace_failed(rr, &r_ref, NULL, is_create);
		return AS_ERR_FORBIDDEN;
	}

	// If local record is better, no-op or fail.
	if (! is_create && (result = as_record_resolve_conflict(policy,
			r->generation, r->last_update_time, (uint16_t)rr->generation,
			rr->last_update_time)) <= 0) {
		record_replace_failed(rr, &r_ref, NULL, is_create);
		return result == 0 ? AS_ERR_RECORD_EXISTS : AS_ERR_GENERATION;
	}
	// else - remote winner - apply it.

	// If creating record, write set-ID into index.
	if (is_create) {
		if (rr->set_name && (result = as_index_set_set_w_len(r, ns,
				rr->set_name, rr->set_name_len, false)) < 0) {
			record_replace_failed(rr, &r_ref, NULL, is_create);
			return -result;
		}

		r->last_update_time = rr->last_update_time;

		// Don't write record if it would be truncated.
		if (as_truncate_record_is_truncated(r, ns)) {
			record_replace_failed(rr, &r_ref, NULL, is_create);
			return AS_OK;
		}
	}
	// else - not bothering to check that sets match.

	as_storage_rd rd;

	if (is_create) {
		as_storage_record_create(ns, r, &rd);
	}
	else {
		as_storage_record_open(ns, r, &rd);
	}

	// TODO - old pickle - remove condition in "six months".
	if (rr->is_old_pickle) {
		// Prepare to store set name, if there is one.
		rd.set_name = rr->set_name;
		rd.set_name_len = rr->set_name_len;
	}
	else {
		rd.pickle = rr->pickle;
		rd.pickle_sz = rr->pickle_sz;
		rd.orig_pickle_sz = as_flat_orig_pickle_size(rr, rd.pickle_sz);
	}

	// Note - deal with key after reading existing record (if such), in case
	// we're dropping the key.

	// Split according to configuration to replace local record.
	bool is_delete = false;

	if (ns->storage_data_in_memory) {
		if (ns->single_bin) {
			result = record_apply_dim_single_bin(rr, &rd, &is_delete);
		}
		else {
			result = record_apply_dim(rr, &rd, skip_sindex, &is_delete);
		}
	}
	else {
		if (ns->single_bin) {
			result = record_apply_ssd_single_bin(rr, &rd, &is_delete);
		}
		else {
			result = record_apply_ssd(rr, &rd, skip_sindex, &is_delete);
		}
	}

	if (result != 0) {
		record_replace_failed(rr, &r_ref, &rd, is_create);
		return result;
	}

	uint16_t set_id = as_index_get_set_id(r); // save for XDR write

	record_replaced(r, rr);

	as_storage_record_close(&rd);
	as_record_done(&r_ref, ns);

	if (do_xdr_write) {
		xdr_write_replica(rr, is_delete, set_id);
	}

	return AS_OK;
}
示例#7
0
int as_msg_make_response_bufbuilder(as_record *r, as_storage_rd *rd,
		cf_buf_builder **bb_r, bool nobindata, char *nsname, bool include_ldt_data,
		bool include_key, bool skip_empty_records, cf_vector *binlist)
{
	// Sanity checks. Either rd should be there or nobindata and nsname should be present.
	if (!(rd || (nobindata && nsname))) {
		cf_detail(AS_PROTO, "Neither storage record nor nobindata is set. Skipping the record.");
		return 0;
	}

	// figure out the size of the entire buffer
	int         set_name_len = 0;
	const char *set_name     = NULL;
	int         ns_len       = rd ? strlen(rd->ns->name) : strlen(nsname);

	if (as_index_get_set_id(r) != INVALID_SET_ID) {
		as_namespace *ns = NULL;

		if (rd) {
			ns = rd->ns;
		} else if (nsname) {
			ns = as_namespace_get_byname(nsname);
		}
		if (!ns) {
			cf_info(AS_PROTO, "Cannot get namespace, needed to get set information. Skipping record.");
			return -1;
		}
		set_name = as_index_get_set_name(r, ns);
		if (set_name) {
			set_name_len = strlen(set_name);
		}
	}

	uint8_t* key = NULL;
	uint32_t key_size = 0;

	if (include_key && as_index_is_flag_set(r, AS_INDEX_FLAG_KEY_STORED)) {
		if (! as_storage_record_get_key(rd)) {
			cf_info(AS_PROTO, "can't get key - skipping record");
			return -1;
		}

		key = rd->key;
		key_size = rd->key_size;
	}

	uint16_t n_fields = 2;
	int msg_sz = sizeof(as_msg);
	msg_sz += sizeof(as_msg_field) + sizeof(cf_digest);
	msg_sz += sizeof(as_msg_field) + ns_len;
	if (set_name) {
		n_fields++;
		msg_sz += sizeof(as_msg_field) + set_name_len;
	}
	if (key) {
		n_fields++;
		msg_sz += sizeof(as_msg_field) + key_size;
	}

	int list_bins   = 0;
	int in_use_bins = rd ? (int)as_bin_inuse_count(rd) : 0;
	as_val *ldt_bin_vals[in_use_bins];

	if (! nobindata) {
		if (binlist) {
			int binlist_sz = cf_vector_size(binlist);

			for (uint16_t i = 0; i < binlist_sz; i++) {
				char binname[AS_ID_BIN_SZ];

				cf_vector_get(binlist, i, (void*)&binname);

				as_bin *p_bin = as_bin_get(rd, binname);

				if (! p_bin) {
					continue;
				}

				msg_sz += sizeof(as_msg_op);
				msg_sz += rd->ns->single_bin ? 0 : strlen(binname);

				if (as_bin_is_hidden(p_bin)) {
					if (include_ldt_data) {
						msg_sz += (int)as_ldt_particle_client_value_size(rd, p_bin, &ldt_bin_vals[list_bins]);
					}
					else {
						ldt_bin_vals[list_bins] = NULL;
					}
				}
				else {
					msg_sz += (int)as_bin_particle_client_value_size(p_bin);
				}

				list_bins++;
			}

			// Don't return an empty record.
			if (skip_empty_records && list_bins == 0) {
				return 0;
			}
		}
		else {
			msg_sz += sizeof(as_msg_op) * in_use_bins;

			for (uint16_t i = 0; i < in_use_bins; i++) {
				as_bin *p_bin = &rd->bins[i];

				msg_sz += rd->ns->single_bin ? 0 : strlen(as_bin_get_name_from_id(rd->ns, p_bin->id));

				if (as_bin_is_hidden(p_bin)) {
					if (include_ldt_data) {
						msg_sz += (int)as_ldt_particle_client_value_size(rd, p_bin, &ldt_bin_vals[i]);
					}
					else {
						ldt_bin_vals[i] = NULL;
					}
				}
				else {
					msg_sz += (int)as_bin_particle_client_value_size(p_bin);
				}
			}
		}
	}

	uint8_t *b;
	cf_buf_builder_reserve(bb_r, msg_sz, &b);

	// set up the header
	uint8_t *buf = b;
	as_msg *msgp = (as_msg *) buf;

	msgp->header_sz = sizeof(as_msg);
	msgp->info1 = (nobindata ? AS_MSG_INFO1_GET_NOBINDATA : 0);
	msgp->info2 = 0;
	msgp->info3 = 0;
	msgp->unused = 0;
	msgp->result_code = 0;
	msgp->generation = r->generation;
	msgp->record_ttl = r->void_time;
	msgp->transaction_ttl = 0;
	msgp->n_fields = n_fields;
	if (rd) {
		if (binlist)
			msgp->n_ops = list_bins;
		else
			msgp->n_ops = in_use_bins;
	} else {
		msgp->n_ops = 0;
	}
	as_msg_swap_header(msgp);

	buf += sizeof(as_msg);

	as_msg_field *mf = (as_msg_field *) buf;
	mf->field_sz = sizeof(cf_digest) + 1;
	mf->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE;
	if (rd) {
		memcpy(mf->data, &rd->keyd, sizeof(cf_digest));
	} else {
		memcpy(mf->data, &r->key, sizeof(cf_digest));
	}
	as_msg_swap_field(mf);
	buf += sizeof(as_msg_field) + sizeof(cf_digest);

	mf = (as_msg_field *) buf;
	mf->field_sz = ns_len + 1;
	mf->type = AS_MSG_FIELD_TYPE_NAMESPACE;
	if (rd) {
		memcpy(mf->data, rd->ns->name, ns_len);
	} else {
		memcpy(mf->data, nsname, ns_len);
	}
	as_msg_swap_field(mf);
	buf += sizeof(as_msg_field) + ns_len;

	if (set_name) {
		mf = (as_msg_field *) buf;
		mf->field_sz = set_name_len + 1;
		mf->type = AS_MSG_FIELD_TYPE_SET;
		memcpy(mf->data, set_name, set_name_len);
		as_msg_swap_field(mf);
		buf += sizeof(as_msg_field) + set_name_len;
	}

	if (key) {
		mf = (as_msg_field *) buf;
		mf->field_sz = key_size + 1;
		mf->type = AS_MSG_FIELD_TYPE_KEY;
		memcpy(mf->data, key, key_size);
		as_msg_swap_field(mf);
		buf += sizeof(as_msg_field) + key_size;
	}

	if (nobindata) {
		return 0;
	}

	if (binlist) {
		list_bins = 0;

		int binlist_sz = cf_vector_size(binlist);

		for (uint16_t i = 0; i < binlist_sz; i++) {
			char binname[AS_ID_BIN_SZ];
			cf_vector_get(binlist, i, (void*)&binname);

			as_bin *p_bin = as_bin_get(rd, binname);

			if (! p_bin) {
				continue;
			}

			as_msg_op *op = (as_msg_op *)buf;

			op->op = AS_MSG_OP_READ;
			op->version = 0;
			op->name_sz = as_bin_memcpy_name(rd->ns, op->name, p_bin);
			op->op_sz = 4 + op->name_sz;

			buf += sizeof(as_msg_op) + op->name_sz;

			if (as_bin_is_hidden(p_bin)) {
				buf += as_ldt_particle_to_client(ldt_bin_vals[list_bins], op);
			}
			else {
				buf += as_bin_particle_to_client(p_bin, op);
			}

			list_bins++;

			as_msg_swap_op(op);
		}
	}
	else {
		for (uint16_t i = 0; i < in_use_bins; i++) {
			as_msg_op *op = (as_msg_op *)buf;

			op->op = AS_MSG_OP_READ;
			op->version = 0;
			op->name_sz = as_bin_memcpy_name(rd->ns, op->name, &rd->bins[i]);
			op->op_sz = 4 + op->name_sz;

			buf += sizeof(as_msg_op) + op->name_sz;

			if (as_bin_is_hidden(&rd->bins[i])) {
				buf += as_ldt_particle_to_client(ldt_bin_vals[i], op);
			}
			else {
				buf += as_bin_particle_to_client(&rd->bins[i], op);
			}

			as_msg_swap_op(op);
		}
	}

	return 0;
}
int
write_replica(as_partition_reservation* rsv, cf_digest* keyd,
		uint8_t* pickled_buf, size_t pickled_sz,
		const as_rec_props* p_rec_props, as_generation generation,
		uint32_t void_time, uint64_t last_update_time, cf_node master,
		uint32_t info, ldt_prole_info* linfo)
{
	as_namespace* ns = rsv->ns;

	if (! as_storage_has_space(rsv->ns)) {
		cf_warning(AS_RW, "{%s} write_replica: drives full", ns->name);
		return AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE;
	}

	as_index_tree* tree = rsv->tree;
	bool is_subrec = false;
	bool is_ldt_parent = false;

	if (ns->ldt_enabled) {
		if ((info & RW_INFO_LDT_SUBREC) != 0 || (info & RW_INFO_LDT_ESR) != 0) {
			tree = rsv->sub_tree;
			is_subrec = true;
		}
		else if ((info & RW_INFO_LDT_PARENTREC) != 0) {
			is_ldt_parent = true;
		}
	}

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	int rv = as_record_get_create(tree, keyd, &r_ref, ns, is_subrec);

	if (rv < 0) {
		cf_warning_digest(AS_RW, keyd, "{%s} write_replica: fail as_record_get_create() ", ns->name);
		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	as_record* r = r_ref.r;
	as_storage_rd rd;
	bool is_create = false;

	if (rv == 1) {
		as_storage_record_create(ns, r, &rd, keyd);
		is_create = true;
	}
	else {
		as_storage_record_open(ns, r, &rd, keyd);
	}

	bool has_sindex = (info & RW_INFO_SINDEX_TOUCHED) != 0;

	rd.ignore_record_on_device = ! has_sindex && ! is_ldt_parent;
	rd.n_bins = as_bin_get_n_bins(r, &rd);

	// TODO - we really need an inline utility for this!
	uint16_t newbins = ntohs(*(uint16_t*)pickled_buf);

	if (! rd.ns->storage_data_in_memory && ! rd.ns->single_bin &&
			newbins > rd.n_bins) {
		rd.n_bins = newbins;
	}

	as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

	rd.bins = as_bin_get_all(r, &rd, stack_bins);

	uint32_t stack_particles_sz = rd.ns->storage_data_in_memory ?
			0 : as_record_buf_get_stack_particles_sz(pickled_buf);
	uint8_t stack_particles[stack_particles_sz + 256];
	uint8_t* p_stack_particles = stack_particles;
	// + 256 for LDT control bin, to hold version.

	if (! ldt_get_prole_version(rsv, keyd, linfo, info, &rd, is_create)) {
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	uint64_t memory_bytes = 0;

	if (! is_create) {
		memory_bytes = as_storage_record_get_n_bytes_memory(&rd);
	}

	as_record_set_properties(&rd, p_rec_props);

	if (as_record_unpickle_replace(r, &rd, pickled_buf, pickled_sz,
			&p_stack_particles, has_sindex) != 0) {
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN; // TODO - better granularity?
	}

	r->generation = generation;
	r->void_time = void_time;
	r->last_update_time = last_update_time;

	as_storage_record_adjust_mem_stats(&rd, memory_bytes);

	uint64_t version_to_set = 0;
	bool set_version = false;

	if (is_ldt_parent) {
		if (linfo->replication_partition_version_match &&
				linfo->ldt_prole_version_set) {
			version_to_set = linfo->ldt_prole_version;
			set_version = true;
		}
		else if (! linfo->replication_partition_version_match) {
			version_to_set = linfo->ldt_source_version;
			set_version = true;
		}
	}

	if (set_version) {
		int ldt_rv = as_ldt_parent_storage_set_version(&rd, version_to_set,
				p_stack_particles, __FILE__, __LINE__);

		if (ldt_rv < 0) {
			cf_warning(AS_LDT, "write_replica: LDT parent storage version set failed %d", ldt_rv);
			// TODO - roll back.
		}
	}

	bool is_delete = false;

	if (! as_bin_inuse_has(&rd)) {
		// A master write that deletes a record by deleting (all) bins sends a
		// binless pickle that ends up here.
		is_delete = true;
		as_index_delete(tree, keyd);
	}

	as_storage_record_write(r, &rd);
	as_storage_record_close(r, &rd);

	uint16_t set_id = as_index_get_set_id(r);

	as_record_done(&r_ref, ns);

	// Don't send an XDR delete if it's disallowed.
	if (is_delete && ! is_xdr_delete_shipping_enabled()) {
		// TODO - should we also not ship if there was no record here before?
		return AS_PROTO_RESULT_OK;
	}

	// Do XDR write if the write is a non-XDR write or forwarding is enabled.
	if ((info & RW_INFO_XDR) == 0 ||
			is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes) {
		xdr_write(ns, *keyd, generation, master, is_delete, set_id, NULL);
	}

	return AS_PROTO_RESULT_OK;
}
示例#9
0
/* Internal Function: Does the post processing for the UDF record after the
 *					  UDF execution. Does the following:
 *		1. Record is closed
 *		2. urecord_op is updated to delete in case there is no bin left in it.
 *		3. record->pickled_buf is populated before the record is close in case
 *		   it was write operation
 *		4. UDF updates cache is cleared
 *
 *	Returns: Nothing
 *
 *	Parameters: urecord          - UDF record to operate on
 *				urecord_op (out) - Populated with the optype
 */
static void
post_processing(udf_record *urecord, udf_optype *urecord_op, uint16_t set_id)
{
	as_storage_rd      *rd   = urecord->rd;
	as_transaction     *tr   = urecord->tr;
	as_index_ref    *r_ref   = urecord->r_ref;

	// INIT
	urecord->pickled_buf     = NULL;
	urecord->pickled_sz      = 0;
	as_rec_props_clear(&urecord->pickled_rec_props);
	bool udf_xdr_ship_op = false;

	getop(urecord, urecord_op);

	if (UDF_OP_IS_DELETE(*urecord_op)
			|| UDF_OP_IS_WRITE(*urecord_op)) {
		udf_xdr_ship_op = true;
	}

	cf_detail(AS_UDF, "FINISH working with LDT Record %p %p %p %p %d", &urecord,
			urecord->tr, urecord->r_ref, urecord->rd,
			(urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN));

	// If there exists a record reference but no bin of the record is in use,
	// delete the record. remove from the tree. Only LDT_RECORD here not needed
	// for LDT_SUBRECORD (only do it if requested by UDF). All the SUBRECORD of
	// removed LDT_RECORD will be lazily cleaned up by defrag.
	if (udf_zero_bins_left(urecord)) {
		as_transaction *tr = urecord->tr;
		as_index_delete(tr->rsv.tree, &tr->keyd);
		urecord->starting_memory_bytes = 0;
		*urecord_op                    = UDF_OPTYPE_DELETE;
	}
	else if (*urecord_op == UDF_OPTYPE_WRITE)	{
		cf_detail_digest(AS_UDF, &rd->keyd, "Committing Changes n_bins %d", as_bin_get_n_bins(r_ref->r, rd));

		size_t  rec_props_data_size = as_storage_record_rec_props_size(rd);
		uint8_t rec_props_data[rec_props_data_size];
		if (rec_props_data_size > 0) {
			as_storage_record_set_rec_props(rd, rec_props_data);
		}

		write_udf_post_processing(tr, rd, &urecord->pickled_buf,
			&urecord->pickled_sz, &urecord->pickled_rec_props,
			urecord->starting_memory_bytes);

		// Now ok to accommodate a new stored key...
		if (! as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_allocate_key(r_ref->r, rd->key, rd->key_size);
			}

			as_index_set_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
		// ... or drop a stored key.
		else if (as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && ! rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_remove_key(r_ref->r);
			}

			as_index_clear_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
	}

	// Collect the record information (for XDR) before closing the record
	as_generation generation = 0;
	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
		generation = r_ref->r->generation;
		set_id = as_index_get_set_id(r_ref->r);
	}
	urecord->op = *urecord_op;
	// Close the record for all the cases
	udf_record_close(urecord);

	// Write to XDR pipe after closing the record, in order to release the record lock as
	// early as possible.
	if (udf_xdr_ship_op == true) {
		if (UDF_OP_IS_WRITE(*urecord_op)) {
			cf_detail(AS_UDF, "UDF write shipping for key %" PRIx64, tr->keyd);
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, false, set_id);
		} else if (UDF_OP_IS_DELETE(*urecord_op)) {
			cf_detail(AS_UDF, "UDF delete shipping for key %" PRIx64, tr->keyd);
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, true, set_id);
		}
	}
}