Beispiel #1
0
// Replace the local copy of a record with a remote copy, but only when the
// remote copy wins conflict resolution (or no local copy exists yet).
//
// Parameters:
//   rr            - the remote record (reservation, digest, metadata, pickle).
//   is_repl_write - true when this is a replica write; the source is then
//                   validated via as_partition_check_source() and the
//                   replica-write conflict resolution policy is used.
//   skip_sindex   - passed through to the multi-bin apply functions.
//   do_xdr_write  - when true, call xdr_write_replica() after a successful
//                   replace.
//
// Returns AS_OK when the remote record was applied, and also when a newly
// created record was dropped because it would be truncated. Otherwise:
//   AS_ERR_OUT_OF_SPACE   - storage has no space, or get/create failed.
//   AS_ERR_FORBIDDEN      - record_replace_check() rejected the local record.
//   AS_ERR_RECORD_EXISTS  - conflict resolution returned 0.
//   AS_ERR_GENERATION     - conflict resolution returned a negative value.
//   other                 - whatever as_partition_check_source(), the set-ID
//                           assignment (negated), or record_apply_*() returned.
int
as_record_replace_if_better(as_remote_record *rr, bool is_repl_write,
		bool skip_sindex, bool do_xdr_write)
{
	as_namespace *ns = rr->rsv->ns;

	if (! as_storage_has_space(ns)) {
		cf_warning(AS_RECORD, "{%s} record replace: drives full", ns->name);
		return AS_ERR_OUT_OF_SPACE;
	}

	CF_ALLOC_SET_NS_ARENA(ns);

	as_index_ref r_ref;
	int get_rv = as_record_get_create(rr->rsv->tree, rr->keyd, &r_ref, ns);

	if (get_rv < 0) {
		return AS_ERR_OUT_OF_SPACE;
	}

	// A return value of 1 means the index entry was just created.
	bool is_create = (get_rv == 1);
	as_index *r = r_ref.r;

	int result;

	// Default to the namespace policy; replica writes override it below.
	conflict_resolution_pol policy = ns->conflict_resolution_policy;

	if (is_repl_write) {
		bool from_replica;

		result = as_partition_check_source(ns, rr->rsv->p, rr->src,
				&from_replica);

		if (result != AS_OK) {
			record_replace_failed(rr, &r_ref, NULL, is_create);
			return result;
		}

		repl_write_init_repl_state(rr, from_replica);
		policy = repl_write_conflict_resolution_policy(ns);
	}

	as_storage_rd rd;

	if (is_create) {
		// Brand new record - write the set-ID into the index, if any.
		if (rr->set_name) {
			result = as_index_set_set_w_len(r, ns, rr->set_name,
					rr->set_name_len, false);

			if (result < 0) {
				record_replace_failed(rr, &r_ref, NULL, is_create);
				return -result;
			}
		}

		r->last_update_time = rr->last_update_time;

		// Nothing to do if the record would be truncated anyway.
		if (as_truncate_record_is_truncated(r, ns)) {
			record_replace_failed(rr, &r_ref, NULL, is_create);
			return AS_OK;
		}

		as_storage_record_create(ns, r, &rd);
	}
	else {
		// Existing record - not bothering to check that sets match.
		if (record_replace_check(r, ns) < 0) {
			record_replace_failed(rr, &r_ref, NULL, is_create);
			return AS_ERR_FORBIDDEN;
		}

		// Local record at least as good? Then no-op or fail.
		result = as_record_resolve_conflict(policy, r->generation,
				r->last_update_time, (uint16_t)rr->generation,
				rr->last_update_time);

		if (result <= 0) {
			record_replace_failed(rr, &r_ref, NULL, is_create);
			return result == 0 ? AS_ERR_RECORD_EXISTS : AS_ERR_GENERATION;
		}

		// Remote record wins - go on and apply it.
		as_storage_record_open(ns, r, &rd);
	}

	// TODO - old pickle - remove condition in "six months".
	if (! rr->is_old_pickle) {
		rd.pickle = rr->pickle;
		rd.pickle_sz = rr->pickle_sz;
		rd.orig_pickle_sz = as_flat_orig_pickle_size(rr, rd.pickle_sz);
	}
	else {
		// Old pickle - prepare to store set name, if there is one.
		rd.set_name = rr->set_name;
		rd.set_name_len = rr->set_name_len;
	}

	// Note - the key is dealt with after reading the existing record (if
	// such), in case we're dropping the key.

	// Choose the apply function by storage mode and single-bin setting.
	bool is_delete = false;

	if (ns->storage_data_in_memory) {
		result = ns->single_bin ?
				record_apply_dim_single_bin(rr, &rd, &is_delete) :
				record_apply_dim(rr, &rd, skip_sindex, &is_delete);
	}
	else {
		result = ns->single_bin ?
				record_apply_ssd_single_bin(rr, &rd, &is_delete) :
				record_apply_ssd(rr, &rd, skip_sindex, &is_delete);
	}

	if (result != 0) {
		record_replace_failed(rr, &r_ref, &rd, is_create);
		return result;
	}

	// Grab the set-ID before as_record_done() - needed for the XDR write.
	uint16_t set_id = as_index_get_set_id(r);

	record_replaced(r, rr);

	as_storage_record_close(&rd);
	as_record_done(&r_ref, ns);

	if (do_xdr_write) {
		xdr_write_replica(rr, is_delete, set_id);
	}

	return AS_OK;
}
/*
 * Internal function: udf_aerospike__apply_update_atomic
 *
 * Parameters:
 * 		urecord --	udf_record to be updated
 *
 * Return Values:
 * 		 0 success
 * 		-1 failure
 *
 * Description:
 * 		This function applies all the cached bin updates atomically. That is,
 * 		if one of the bin update/delete/create operations fails, the entire
 * 		function fails. If the nth update fails, the n-1 updates already
 * 		applied are rolled back to their initial values. If a check made after
 * 		all updates were applied fails (LDT version, record size, stop-writes,
 * 		drive space, ttl), all updates are rolled back.
 *
 * 		Special Notes:
 * 		i. The bin usage limit is checked before allocating space for any of
 * 		the bins. A failure of that check happens before the sindex lock is
 * 		taken, so the rollback path only releases the lock if it was acquired.
 *
 * 		ii. If one of the updates to be rolled back is a bin creation,
 * 		udf_aerospike_delbin is called. This will not free up the bin metadata.
 * 		So there will be a small memory mismatch b/w replica (which did not get
 * 		the record at all and hence no memory is accounted) and the master.
 * 		To avoid such cases, we are doing checks upfront.
 *
 * 		Callers:
 * 		udf_aerospike__execute_updates
 * 		In this function, if udf_aerospike__apply_update_atomic fails, the record
 * 		is not committed to the storage. On success, record is closed which commits to
 * 		the storage and reopened for the next set of udf updates.
 * 		The return value from udf_aerospike__apply_update_atomic is passed on to the
 * 		callers of this function.
 */
int
udf_aerospike__apply_update_atomic(udf_record *urecord)
{
	int rc						= 0;
	int failmax					= 0;	// Rollback undoes updates [0, failmax)
	int new_bins				= 0;	// How many new bins have to be created in this update
	as_storage_rd * rd			= urecord->rd;
	as_namespace * ns			= rd->ns;
	bool has_sindex				= as_sindex_ns_has_sindex(ns);
	bool sindex_locked			= false;	// true only while SINDEX_GRLOCK is held here
	bool is_record_dirty		= false;
	bool is_record_flag_dirty	= false;
	uint8_t old_index_flags		= as_index_get_flags(rd->r);
	uint8_t new_index_flags		= 0;

	// This will iterate over all the updates and apply them to storage.
	// The items will remain, and be used as cache values. If an error
	// occurs during setbin(), we roll back all the operations applied so
	// far and return failure.
	cf_detail(AS_UDF, "execute updates: %d updates", urecord->nupdates);

	// Loop twice to make sure the updates are performed first so in case
	// something goes wrong it can be rolled back. The deletes will go through
	// successfully generally.

	// In first iteration, just calculate how many new bins need to be created
	for(uint32_t i = 0; i < urecord->nupdates; i++ ) {
		if ( urecord->updates[i].dirty ) {
			char *      k = urecord->updates[i].name;
			if ( k != NULL ) {
				if ( !as_bin_get(rd, k) ) {
					new_bins++;
				}
			}
		}
	}
	// Free bins - total bins not in use in the record
	// Delta bins - new bins that need to be created
	int inuse_bins = as_bin_inuse_count(rd);
	int free_bins  = rd->n_bins - inuse_bins;
	int delta_bins = new_bins - free_bins;
	cf_detail(AS_UDF, "Total bins %d, In use bins %d, Free bins %d , New bins %d, Delta bins %d",
			  rd->n_bins, inuse_bins, free_bins, new_bins, delta_bins);

	// Check bin usage limit. Nothing has been applied and no lock is held
	// yet, so Rollback has nothing to undo (failmax is 0).
	if ((inuse_bins + new_bins > UDF_RECORD_BIN_ULIMIT) ||
			(urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS)) {
		cf_warning(AS_UDF, "bin limit of %d for UDF exceeded: %d bins in use, %d bins free, %s%d new bins needed",
				(int)UDF_RECORD_BIN_ULIMIT, inuse_bins, free_bins,
				(urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS) ? ">" : "", new_bins);
		goto Rollback;
	}

	// Allocate space for all the new bins that need to be created beforehand
	if (delta_bins > 0 && rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
		as_bin_allocate_bin_space(urecord->r_ref->r, rd, delta_bins);
	}

	if (!rd->ns->storage_data_in_memory && !urecord->particle_data) {
		// 256 as upper bound on the LDT control bin, we may write version below
		// leave it at the end for its use
		urecord->particle_data = cf_malloc(rd->ns->storage_write_block_size + 256);
		urecord->cur_particle_data = urecord->particle_data;
		urecord->end_particle_data = urecord->particle_data + rd->ns->storage_write_block_size;
	}

	if (has_sindex) {
		SINDEX_GRLOCK();
		sindex_locked = true;
	}

	// In second iteration apply updates.
	for(uint32_t i = 0; i < urecord->nupdates; i++ ) {
		urecord->updates[i].oldvalue  = NULL;
		urecord->updates[i].washidden = false;
		if ( urecord->updates[i].dirty && rc == 0) {

			char *      k = urecord->updates[i].name;
			as_val *    v = urecord->updates[i].value;
			bool        h = urecord->updates[i].ishidden;

			if ( k != NULL ) {
				if ( v == NULL || v->type == AS_NIL ) {
					// if the value is NIL, then do a delete
					cf_detail(AS_UDF, "execute update: position %d deletes bin %s", i, k);
					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
					urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
					// Only case delete fails if bin is not found that is
					// as good as delete. Ignore return code !!
					udf_aerospike_delbin(urecord, k);

					if (urecord->dirty != NULL) {
						xdr_fill_dirty_bins(urecord->dirty);
					}
				}
				else {
					// otherwise, it is a set
					cf_detail(AS_UDF, "execute update: position %d sets bin %s", i, k);
					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
					urecord->updates[i].washidden = udf_record_bin_ishidden(urecord, k);
					rc = udf_aerospike_setbin(urecord, i, k, v, h);
					if (rc) {
						// Update i itself was not applied - drop its saved
						// old value here, and roll back only [0, i).
						if (urecord->updates[i].oldvalue) {
							as_val_destroy(urecord->updates[i].oldvalue);
							urecord->updates[i].oldvalue = NULL;
						}
						failmax = i;
						goto Rollback;
					}

					if (urecord->dirty != NULL) {
						xdr_add_dirty_bin(ns, urecord->dirty, k, strlen(k));
					}
				}
			}

			is_record_dirty = true;
		}
	}

	if (urecord->ldt_rectype_bit_update) {
		if (urecord->ldt_rectype_bit_update < 0) {
			// ldt_rectype_bit_update is negative in case we want to reset the bits
			uint8_t rectype_bits = urecord->ldt_rectype_bit_update * -1;
			new_index_flags = old_index_flags & ~rectype_bits;
		} else {
			new_index_flags = old_index_flags | urecord->ldt_rectype_bit_update;
		}

		if (new_index_flags != old_index_flags) {
			as_index_clear_flags(rd->r, old_index_flags);
			as_index_set_flags(rd->r, new_index_flags);
			is_record_flag_dirty = true;
			cf_detail_digest(AS_RW, &urecord->tr->keyd, "Setting index flags from %d to %d new flag %d", old_index_flags, new_index_flags, as_index_get_flags(rd->r));
		}
	}

	{
		// This is _NOT_ for writing to the storage but for simply performing sizing
		// calculation. If we know the upper bounds of size of rec_props.. we could
		// avoid this work and check with that much correction ...
		//
		// See
		//  - udf_rw_post_processing for building rec_props for replication
		//  - udf_record_close for building rec_props for writing it to storage
		size_t  rec_props_data_size = as_storage_record_rec_props_size(rd);
		// A variable length array must have at least one element, so never
		// declare it with size 0. The buffer is only used when the size is > 0.
		uint8_t rec_props_data[rec_props_data_size > 0 ? rec_props_data_size : 1];
		if (rec_props_data_size > 0) {
			as_storage_record_set_rec_props(rd, rec_props_data);
		}

		// Version is set in the end after record size check. Setting version won't change the size of
		// the record. And if it were before size check then this setting of version as well needs to
		// be backed out.
		// TODO: Add backout logic would work till very first create call of LDT end up crossing over
		// record boundary
		if (rd->ns->ldt_enabled && as_ldt_record_is_parent(rd->r)) {
			int rv = as_ldt_parent_storage_set_version(rd, urecord->lrecord->version, urecord->end_particle_data, __FILE__, __LINE__);
			if (rv < 0) {
				cf_warning(AS_LDT, "udf_aerospike__apply_update_atomic: Internal Error "
							" [Failed to set the version on storage rv=%d]... Fail",rv);
				// All bin updates have been applied at this point - they
				// must all be undone, same as for the checks below.
				failmax = (int)urecord->nupdates;
				goto Rollback;
			}
			// TODO - if size check below fails, won't write to device -
			// different behavior than write_to_device flag - OK?
			is_record_dirty = true;
		}

		if (! as_storage_record_size_and_check(rd)) {
			cf_warning(AS_UDF, "record failed storage size check, will not be updated");
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}

		if (cf_atomic32_get(rd->ns->stop_writes) == 1) {
			cf_warning(AS_UDF, "UDF failed by stop-writes, record will not be updated");
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}

		if (! as_storage_has_space(rd->ns)) {
			cf_warning(AS_UDF, "drives full, record will not be updated");
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}

		if (! is_valid_ttl(rd->ns, urecord->tr->msgp->msg.record_ttl)) {
			cf_warning(AS_UDF, "invalid ttl %u", urecord->tr->msgp->msg.record_ttl);
			failmax = (int)urecord->nupdates;
			goto Rollback;
		}
	}

	if (sindex_locked) {
		SINDEX_GUNLOCK();
		sindex_locked = false;
	}

	// If there were updates do miscellaneous successful commit
	// tasks
	if (is_record_dirty
			|| is_record_flag_dirty
			|| (urecord->flag & UDF_RECORD_FLAG_METADATA_UPDATED)) {
		urecord->flag |= UDF_RECORD_FLAG_HAS_UPDATES; // will write to storage
	}
	urecord->ldt_rectype_bit_update = 0;

	// Clean up oldvalue cache and reset dirty. All the changes made
	// here have been made to the particle buffer. Nothing will now be backed out.
	for (uint32_t i = 0; i < urecord->nupdates; i++) {
		udf_record_bin * bin = &urecord->updates[i];
		if (bin->oldvalue != NULL ) {
			as_val_destroy(bin->oldvalue);
			bin->oldvalue = NULL;
		}
		bin->dirty    = false;
	}
	return rc;

Rollback:
	cf_debug(AS_UDF, "Rollback Called: failmax %d", failmax);
	for (int i = 0; i < failmax; i++) {
		if (urecord->updates[i].dirty) {
			char *      k = urecord->updates[i].name;
			// Pick the oldvalue for rollback
			as_val *    v = urecord->updates[i].oldvalue;
			bool        h = urecord->updates[i].washidden;
			if ( k != NULL ) {
				if ( v == NULL || v->type == AS_NIL ) {
					// if the value is NIL, then do a delete
					cf_detail(AS_UDF, "execute rollback: position %d deletes bin %s", i, k);
					rc = udf_aerospike_delbin(urecord, k);
				}
				else {
					// otherwise, it is a set
					cf_detail(AS_UDF, "execute rollback: position %d sets bin %s", i, k);
					rc = udf_aerospike_setbin(urecord, i, k, v, h);
					if (rc) {
						cf_warning(AS_UDF, "Rollback failed .. not good ... !!");
					}
				}
			}
			if (v) {
				as_val_destroy(v);
				// Don't leave a dangling pointer behind in the update cache.
				urecord->updates[i].oldvalue = NULL;
				cf_debug(AS_UDF, "ROLLBACK as_val_destroy()");
			}
		}
	}

	if (is_record_dirty && urecord->dirty != NULL) {
		xdr_clear_dirty_bins(urecord->dirty);
	}

	if (is_record_flag_dirty) {
		as_index_clear_flags(rd->r, new_index_flags);
		as_index_set_flags(rd->r, old_index_flags);
		is_record_flag_dirty = false;
	}
	urecord->ldt_rectype_bit_update = 0;

	// Only release the sindex lock if this call actually acquired it - the
	// bin limit failure jumps here before the lock is taken.
	if (sindex_locked) {
		SINDEX_GUNLOCK();
		sindex_locked = false;
	}

	// Reset the flat size in case the stuff is backed out !!! it should not
	// fail in the backout code ...
	if (! as_storage_record_size_and_check(rd)) {
		cf_warning(AS_LDT, "Does not fit even after rollback... it is trouble");
	}

	// Do not clean up the cache in case of failure
	return -1;
}
/*
 * write_replica
 *
 * Apply a pickled record to the local copy of a partition, then (optionally)
 * hand the write to XDR.
 *
 * Parameters:
 * 		rsv              -- partition reservation; supplies namespace and trees
 * 		keyd             -- digest of the record to write
 * 		pickled_buf      -- pickled record; starts with a 16-bit bin count in
 * 		                    network byte order
 * 		pickled_sz       -- size of pickled_buf in bytes
 * 		p_rec_props      -- record properties to set on the storage record
 * 		generation       -- generation to stamp on the record
 * 		void_time        -- void-time to stamp on the record
 * 		last_update_time -- last-update-time to stamp on the record
 * 		master           -- node passed through to xdr_write()
 * 		info             -- RW_INFO_* flags (LDT record type, sindex touched,
 * 		                    XDR origin)
 * 		linfo            -- LDT version info; dereferenced when the record is
 * 		                    an LDT parent
 *
 * Return Values:
 * 		AS_PROTO_RESULT_OK                          success
 * 		AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE storage has no space
 * 		AS_PROTO_RESULT_FAIL_UNKNOWN                get/create, LDT version
 * 		                                            lookup or unpickle failed
 */
int
write_replica(as_partition_reservation* rsv, cf_digest* keyd,
		uint8_t* pickled_buf, size_t pickled_sz,
		const as_rec_props* p_rec_props, as_generation generation,
		uint32_t void_time, uint64_t last_update_time, cf_node master,
		uint32_t info, ldt_prole_info* linfo)
{
	as_namespace* ns = rsv->ns;

	if (! as_storage_has_space(rsv->ns)) {
		cf_warning(AS_RW, "{%s} write_replica: drives full", ns->name);
		return AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE;
	}

	as_index_tree* tree = rsv->tree;
	bool is_subrec = false;
	bool is_ldt_parent = false;

	// LDT sub-records and ESRs live in the sub-tree, not the main tree.
	if (ns->ldt_enabled) {
		if ((info & RW_INFO_LDT_SUBREC) != 0 || (info & RW_INFO_LDT_ESR) != 0) {
			tree = rsv->sub_tree;
			is_subrec = true;
		}
		else if ((info & RW_INFO_LDT_PARENTREC) != 0) {
			is_ldt_parent = true;
		}
	}

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	int rv = as_record_get_create(tree, keyd, &r_ref, ns, is_subrec);

	if (rv < 0) {
		cf_warning_digest(AS_RW, keyd, "{%s} write_replica: fail as_record_get_create() ", ns->name);
		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	as_record* r = r_ref.r;
	as_storage_rd rd;
	bool is_create = false;

	// rv == 1 means the index entry was just created.
	if (rv == 1) {
		as_storage_record_create(ns, r, &rd, keyd);
		is_create = true;
	}
	else {
		as_storage_record_open(ns, r, &rd, keyd);
	}

	bool has_sindex = (info & RW_INFO_SINDEX_TOUCHED) != 0;

	rd.ignore_record_on_device = ! has_sindex && ! is_ldt_parent;
	rd.n_bins = as_bin_get_n_bins(r, &rd);

	// Decode the leading big-endian 16-bit bin count byte by byte. This is
	// equivalent to ntohs() on the first two bytes, but does not read the
	// uint8_t buffer through a uint16_t pointer (which may be misaligned and
	// violates the aliasing rules).
	uint16_t newbins = (uint16_t)(((uint16_t)pickled_buf[0] << 8) |
			pickled_buf[1]);

	if (! rd.ns->storage_data_in_memory && ! rd.ns->single_bin &&
			newbins > rd.n_bins) {
		rd.n_bins = newbins;
	}

	// Bins live on the stack only when data is not in memory. A variable
	// length array must have at least one element, so never size it 0.
	size_t n_stack_bins = rd.ns->storage_data_in_memory ? 0 : rd.n_bins;
	as_bin stack_bins[n_stack_bins > 0 ? n_stack_bins : 1];

	rd.bins = as_bin_get_all(r, &rd, stack_bins);

	uint32_t stack_particles_sz = rd.ns->storage_data_in_memory ?
			0 : as_record_buf_get_stack_particles_sz(pickled_buf);
	// + 256 for LDT control bin, to hold version.
	uint8_t stack_particles[stack_particles_sz + 256];
	uint8_t* p_stack_particles = stack_particles;

	if (! ldt_get_prole_version(rsv, keyd, linfo, info, &rd, is_create)) {
		// Don't leave behind an index entry we created just for this write.
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	// Memory used before the replace, for the stats adjustment below.
	uint64_t memory_bytes = 0;

	if (! is_create) {
		memory_bytes = as_storage_record_get_n_bytes_memory(&rd);
	}

	as_record_set_properties(&rd, p_rec_props);

	if (as_record_unpickle_replace(r, &rd, pickled_buf, pickled_sz,
			&p_stack_particles, has_sindex) != 0) {
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN; // TODO - better granularity?
	}

	r->generation = generation;
	r->void_time = void_time;
	r->last_update_time = last_update_time;

	as_storage_record_adjust_mem_stats(&rd, memory_bytes);

	// For LDT parents, decide which version (if any) to stamp on storage.
	uint64_t version_to_set = 0;
	bool set_version = false;

	if (is_ldt_parent) {
		if (linfo->replication_partition_version_match &&
				linfo->ldt_prole_version_set) {
			version_to_set = linfo->ldt_prole_version;
			set_version = true;
		}
		else if (! linfo->replication_partition_version_match) {
			version_to_set = linfo->ldt_source_version;
			set_version = true;
		}
	}

	if (set_version) {
		int ldt_rv = as_ldt_parent_storage_set_version(&rd, version_to_set,
				p_stack_particles, __FILE__, __LINE__);

		if (ldt_rv < 0) {
			cf_warning(AS_LDT, "write_replica: LDT parent storage version set failed %d", ldt_rv);
			// TODO - roll back.
		}
	}

	bool is_delete = false;

	if (! as_bin_inuse_has(&rd)) {
		// A master write that deletes a record by deleting (all) bins sends a
		// binless pickle that ends up here.
		is_delete = true;
		as_index_delete(tree, keyd);
	}

	as_storage_record_write(r, &rd);
	as_storage_record_close(r, &rd);

	// Read the set-ID before as_record_done() - needed for the XDR write.
	uint16_t set_id = as_index_get_set_id(r);

	as_record_done(&r_ref, ns);

	// Don't send an XDR delete if it's disallowed.
	if (is_delete && ! is_xdr_delete_shipping_enabled()) {
		// TODO - should we also not ship if there was no record here before?
		return AS_PROTO_RESULT_OK;
	}

	// Do XDR write if the write is a non-XDR write or forwarding is enabled.
	if ((info & RW_INFO_XDR) == 0 ||
			is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes) {
		xdr_write(ns, *keyd, generation, master, is_delete, set_id, NULL);
	}

	return AS_PROTO_RESULT_OK;
}