/*
 * Function: udf_storage_record_open
 *
 * Opens the storage record behind the passed in UDF record, loads its bins
 * and marks the UDF record as having its storage open.
 *
 * Parameters:
 * 		urecord : UDF record; its tr, r_ref and rd members are used as-is
 *
 * Return value :  0 on success
 * 				  -1 if the record's bin count exceeds UDF_RECORD_BIN_ULIMIT
 * 				  the non-zero result of as_storage_record_open() if the
 * 				  storage record could not be opened
 *
 * Callers:
 * 		udf_record_open
 *
 * Note: Nothing is validated here, so the caller has to make sure that all
 *       protections are taken and all checks are done.
 *
 * Side effects:
 *  	rd->n_bins and rd->bins are populated
 *  	urecord->starting_memory_bytes is recorded
 *  	UDF_RECORD_FLAG_STORAGE_OPEN is set in urecord->flag
 *  	for a sub-record, urecord->lrecord->subrec_io is incremented
 */
int
udf_storage_record_open(udf_record *urecord)
{
	cf_debug_digest(AS_UDF, &urecord->tr->keyd, "[ENTER] Opening record key:");

	as_transaction *txn     = urecord->tr;
	as_index       *index   = urecord->r_ref->r;
	as_storage_rd  *storage = urecord->rd;

	int open_rv = as_storage_record_open(txn->rsv.ns, index, storage, &index->key);

	if (open_rv != 0) {
		cf_warning(AS_UDF, "Could not open record !! %d", open_rv);
		return open_rv;
	}

	storage->n_bins = as_bin_get_n_bins(index, storage);

	// Refuse records with more bins than UDF processing allows; the storage
	// record was opened above, so close it again before bailing out.
	if (storage->n_bins > UDF_RECORD_BIN_ULIMIT) {
		cf_warning(AS_UDF, "record has too many bins (%d) for UDF processing", storage->n_bins);
		as_storage_record_close(index, storage);
		return -1;
	}

	// Multi-bin data that is not kept in memory is loaded into
	// urecord->stack_bins, so advertise that array's full capacity.
	if (! (txn->rsv.ns->storage_data_in_memory || txn->rsv.ns->single_bin)) {
		storage->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin);
	}

	storage->bins = as_bin_get_all(index, storage, urecord->stack_bins);

	// Snapshot of the memory footprint taken right after the bins are loaded.
	urecord->starting_memory_bytes = as_storage_record_get_n_bytes_memory(storage);

	as_storage_record_get_key(storage);

	urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN;

	int is_subrecord = (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) != 0;

	if (is_subrecord) {
		urecord->lrecord->subrec_io++;
	}

	cf_detail_digest(AS_UDF, &txn->keyd, "Storage Open: Rec(%p) flag(%x) Digest:", urecord, urecord->flag );

	if (is_subrecord) {
		as_ldt_subrec_storage_validate(storage, "Reading");
	}

	return 0;
}
Exemple #2
0
/* Internal Function: Does the post processing for the UDF record after the
 *					  UDF execution. Does the following:
 *		1. Derives the operation type from the record flags:
 *		     HAS_UPDATES and OPEN                     -> WRITE  (shipped)
 *		     HAS_UPDATES, not OPEN, PREEXISTS         -> DELETE (shipped)
 *		     HAS_UPDATES, not OPEN, did not pre-exist -> NONE
 *		     no updates, PREEXISTS, not OPEN          -> DELETE (shipped)
 *		     anything else                            -> READ
 *		2. urecord_op is updated to delete in case the (non sub-)record is
 *		   open but has no bin left in use; the record is then removed from
 *		   the tree.
 *		3. For a write, urecord->pickled_buf / pickled_sz /
 *		   pickled_void_time / pickled_rec_props are populated before the
 *		   record is closed, and the stored-key index flag is brought in
 *		   line with rd->key.
 *		4. Record is closed
 *		5. The operation is handed to xdr_write() if it is to be shipped
 *		6. urecord->particle_data is freed and the UDF updates cache is
 *		   cleared
 *
 *	Returns: Nothing
 *
 *	Parameters: urecord          - UDF record to operate on
 *				urecord_op (out) - Populated with the optype
 *				set_id           - set id passed to xdr_write(); replaced by
 *								   the id read from the index when the
 *								   record is still open
 */
void
udf_rw_post_processing(udf_record *urecord, udf_optype *urecord_op, uint16_t set_id)
{
	as_storage_rd      *rd   = urecord->rd;
	as_transaction     *tr   = urecord->tr;
	as_index_ref    *r_ref   = urecord->r_ref;

	// INIT
	urecord->pickled_buf     = NULL;
	urecord->pickled_sz      = 0;
	urecord->pickled_void_time     = 0;
	as_rec_props_clear(&urecord->pickled_rec_props);
	bool udf_xdr_ship_op = false;

	// TODO: optimize not to allocate buffer if it is single
	// node cluster. No remote to send data to
	// Check if UDF has updates.
	if (urecord->flag & UDF_RECORD_FLAG_HAS_UPDATES) {
		// Check if the record is not deleted after an update
		if ( urecord->flag & UDF_RECORD_FLAG_OPEN) {
			*urecord_op = UDF_OPTYPE_WRITE;
			udf_xdr_ship_op = true;
		}
		else {
			// If the record has updates and it is not open,
			// and if it pre-existed it's an update followed by a delete.
			if ( urecord->flag & UDF_RECORD_FLAG_PREEXISTS) {
				*urecord_op = UDF_OPTYPE_DELETE;
				udf_xdr_ship_op = true;
			}
			// If the record did not pre-exist and is updated
			// and it is not open, then it is create followed by
			// delete essentially no_op.
			else {
				*urecord_op = UDF_OPTYPE_NONE;
			}
		}
	} else if ((urecord->flag & UDF_RECORD_FLAG_PREEXISTS)
			   && !(urecord->flag & UDF_RECORD_FLAG_OPEN)) {
		*urecord_op  = UDF_OPTYPE_DELETE;
		udf_xdr_ship_op = true;
	} else {
		*urecord_op  = UDF_OPTYPE_READ;
	}

	// Log the record pointer itself (not the address of the local variable
	// that holds it) so the value can be matched against other log lines.
	cf_detail(AS_UDF, "FINISH working with LDT Record %p %p %p %p %d", urecord,
			urecord->tr, urecord->r_ref, urecord->rd,
			(urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN));

	// If there exists a record reference but no bin of the record is in use,
	// delete the record. remove from the tree. Only LDT_RECORD here not needed
	// for LDT_SUBRECORD (only do it if requested by UDF). All the SUBRECORD of
	// removed LDT_RECORD will be lazily cleaned up by defrag.
	if (!(urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD)
			&& urecord->flag & UDF_RECORD_FLAG_OPEN
			&& !as_bin_inuse_has(rd)) {
		as_index_delete(tr->rsv.tree, &tr->keyd);
		urecord->starting_memory_bytes = 0;
		*urecord_op                    = UDF_OPTYPE_DELETE;
		udf_xdr_ship_op = true;
	} else if (*urecord_op == UDF_OPTYPE_WRITE)	{
		// The digest is not an integer, so it must go through the digest
		// logging macro rather than a %PRIx64 conversion.
		cf_detail_digest(AS_UDF, &rd->keyd, "Committing Changes n_bins %d", as_bin_get_n_bins(r_ref->r, rd));

		// rec_props_data is handed to rd and must stay in scope until
		// write_local_post_processing() below has run.
		size_t  rec_props_data_size = as_storage_record_rec_props_size(rd);
		uint8_t rec_props_data[rec_props_data_size];
		if (rec_props_data_size > 0) {
			as_storage_record_set_rec_props(rd, rec_props_data);
		}

		write_local_post_processing(tr, tr->rsv.ns, NULL, &urecord->pickled_buf,
			&urecord->pickled_sz, &urecord->pickled_void_time,
			&urecord->pickled_rec_props, true/*increment_generation*/,
			NULL, r_ref->r, rd, urecord->starting_memory_bytes);

		// Now ok to accommodate a new stored key...
		if (! as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_allocate_key(r_ref->r, rd->key, rd->key_size);
			}

			as_index_set_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
		// ... or drop a stored key.
		else if (as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && ! rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_remove_key(r_ref->r);
			}

			as_index_clear_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
	}

	// Collect the record information (for XDR) before closing the record
	as_generation generation = 0;
	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
		generation = r_ref->r->generation;
		set_id = as_index_get_set_id(r_ref->r);
	}
	// Close the record for all the cases
	udf_record_close(urecord, false);

	// Write to XDR pipe after closing the record, in order to release the record lock as
	// early as possible.
	if (udf_xdr_ship_op) {
		if (UDF_OP_IS_WRITE(*urecord_op)) {
			cf_detail_digest(AS_UDF, &tr->keyd, "UDF write shipping for key ");
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, false, set_id);
		} else if (UDF_OP_IS_DELETE(*urecord_op)) {
			cf_detail_digest(AS_UDF, &tr->keyd, "UDF delete shipping for key ");
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, true, set_id);
		}
	}

	// Replication happens when the main record replicates
	if (urecord->particle_data) {
		cf_free(urecord->particle_data);
		urecord->particle_data = NULL;
	}
	udf_record_cache_free(urecord);
}
/*
 * Apply a replicated (pickled) record write to the local copy of the record.
 *
 * Gets or creates the record in the reservation's tree (the sub-tree when the
 * namespace has LDT enabled and info marks an LDT sub-record or ESR), replaces
 * its bins from pickled_buf, stamps generation / void-time / last-update-time,
 * writes and closes the storage record, and finally hands the change to
 * xdr_write() when shipping applies.
 *
 * Parameters:
 *		rsv              - partition reservation; supplies ns, tree and sub_tree
 *		keyd             - digest of the record being written
 *		pickled_buf      - pickled record; its first two bytes are read as the
 *						   network-order bin count
 *		pickled_sz       - size of pickled_buf, passed on to the unpickle call
 *		p_rec_props      - passed to as_record_set_properties()
 *		generation       - stored in r->generation and passed to xdr_write()
 *		void_time        - stored in r->void_time
 *		last_update_time - stored in r->last_update_time
 *		master           - passed through to xdr_write()
 *		info             - RW_INFO_* bit flags (LDT sub-record / ESR / parent,
 *						   sindex touched, XDR origin)
 *		linfo            - LDT prole info; passed to ldt_get_prole_version()
 *						   and dereferenced when the write is for an LDT parent
 *
 * Returns: AS_PROTO_RESULT_OK on success,
 *			AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE when
 *			as_storage_has_space() reports no space,
 *			AS_PROTO_RESULT_FAIL_UNKNOWN on every other failure path.
 */
int
write_replica(as_partition_reservation* rsv, cf_digest* keyd,
		uint8_t* pickled_buf, size_t pickled_sz,
		const as_rec_props* p_rec_props, as_generation generation,
		uint32_t void_time, uint64_t last_update_time, cf_node master,
		uint32_t info, ldt_prole_info* linfo)
{
	as_namespace* ns = rsv->ns;

	if (! as_storage_has_space(rsv->ns)) {
		cf_warning(AS_RW, "{%s} write_replica: drives full", ns->name);
		return AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE;
	}

	// The LDT info bits are only honoured when the namespace has LDT enabled;
	// sub-records and ESRs live in the reservation's sub-tree.
	as_index_tree* tree = rsv->tree;
	bool is_subrec = false;
	bool is_ldt_parent = false;

	if (ns->ldt_enabled) {
		if ((info & RW_INFO_LDT_SUBREC) != 0 || (info & RW_INFO_LDT_ESR) != 0) {
			tree = rsv->sub_tree;
			is_subrec = true;
		}
		else if ((info & RW_INFO_LDT_PARENTREC) != 0) {
			is_ldt_parent = true;
		}
	}

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	int rv = as_record_get_create(tree, keyd, &r_ref, ns, is_subrec);

	if (rv < 0) {
		cf_warning_digest(AS_RW, keyd, "{%s} write_replica: fail as_record_get_create() ", ns->name);
		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	as_record* r = r_ref.r;
	as_storage_rd rd;
	bool is_create = false;

	// A return of 1 is treated as "record was just created"; any other
	// non-negative value opens the existing storage record.
	// NOTE(review): the results of as_storage_record_create() and
	// as_storage_record_open() are not checked here - confirm they cannot fail.
	if (rv == 1) {
		as_storage_record_create(ns, r, &rd, keyd);
		is_create = true;
	}
	else {
		as_storage_record_open(ns, r, &rd, keyd);
	}

	bool has_sindex = (info & RW_INFO_SINDEX_TOUCHED) != 0;

	// Set only when neither a secondary index nor an LDT parent is involved.
	// NOTE(review): presumably lets storage skip loading the old on-device
	// record - confirm against the storage layer.
	rd.ignore_record_on_device = ! has_sindex && ! is_ldt_parent;
	rd.n_bins = as_bin_get_n_bins(r, &rd);

	// TODO - we really need an inline utility for this!
	uint16_t newbins = ntohs(*(uint16_t*)pickled_buf);

	// For multi-bin data not kept in memory, size for whichever is larger:
	// the current bin count or the incoming one.
	if (! rd.ns->storage_data_in_memory && ! rd.ns->single_bin &&
			newbins > rd.n_bins) {
		rd.n_bins = newbins;
	}

	// Zero-length when data is in memory. The array is handed to
	// as_bin_get_all() and has to stay in scope while rd is in use.
	as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

	rd.bins = as_bin_get_all(r, &rd, stack_bins);

	uint32_t stack_particles_sz = rd.ns->storage_data_in_memory ?
			0 : as_record_buf_get_stack_particles_sz(pickled_buf);
	uint8_t stack_particles[stack_particles_sz + 256];
	uint8_t* p_stack_particles = stack_particles;
	// + 256 for LDT control bin, to hold version.

	// Failure path: undo a fresh create, then close storage and call
	// as_record_done() on the record reference before returning.
	if (! ldt_get_prole_version(rsv, keyd, linfo, info, &rd, is_create)) {
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	// Memory footprint before the replace; stays 0 for a newly created
	// record. Passed to as_storage_record_adjust_mem_stats() further down.
	uint64_t memory_bytes = 0;

	if (! is_create) {
		memory_bytes = as_storage_record_get_n_bytes_memory(&rd);
	}

	as_record_set_properties(&rd, p_rec_props);

	// Same cleanup as above if the pickle cannot be applied.
	if (as_record_unpickle_replace(r, &rd, pickled_buf, pickled_sz,
			&p_stack_particles, has_sindex) != 0) {
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN; // TODO - better granularity?
	}

	r->generation = generation;
	r->void_time = void_time;
	r->last_update_time = last_update_time;

	as_storage_record_adjust_mem_stats(&rd, memory_bytes);

	// For an LDT parent, pick the version to stamp: the prole version when
	// the partition versions match and a prole version is set, the source
	// version when they do not match, and nothing otherwise.
	uint64_t version_to_set = 0;
	bool set_version = false;

	if (is_ldt_parent) {
		if (linfo->replication_partition_version_match &&
				linfo->ldt_prole_version_set) {
			version_to_set = linfo->ldt_prole_version;
			set_version = true;
		}
		else if (! linfo->replication_partition_version_match) {
			version_to_set = linfo->ldt_source_version;
			set_version = true;
		}
	}

	if (set_version) {
		int ldt_rv = as_ldt_parent_storage_set_version(&rd, version_to_set,
				p_stack_particles, __FILE__, __LINE__);

		// A failure here is only logged; the write carries on.
		if (ldt_rv < 0) {
			cf_warning(AS_LDT, "write_replica: LDT parent storage version set failed %d", ldt_rv);
			// TODO - roll back.
		}
	}

	bool is_delete = false;

	if (! as_bin_inuse_has(&rd)) {
		// A master write that deletes a record by deleting (all) bins sends a
		// binless pickle that ends up here.
		is_delete = true;
		as_index_delete(tree, keyd);
	}

	as_storage_record_write(r, &rd);
	as_storage_record_close(r, &rd);

	// Fetch the set id before as_record_done() is called on r_ref.
	uint16_t set_id = as_index_get_set_id(r);

	as_record_done(&r_ref, ns);

	// Don't send an XDR delete if it's disallowed.
	if (is_delete && ! is_xdr_delete_shipping_enabled()) {
		// TODO - should we also not ship if there was no record here before?
		return AS_PROTO_RESULT_OK;
	}

	// Do XDR write if the write is a non-XDR write or forwarding is enabled.
	if ((info & RW_INFO_XDR) == 0 ||
			is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes) {
		xdr_write(ns, *keyd, generation, master, is_delete, set_id, NULL);
	}

	return AS_PROTO_RESULT_OK;
}
// Build response to batch request.
//
// Walks every digest in btr->digests that is not yet marked done and appends
// one entry per digest to *bb_r:
//   - the record (with its bins when btr->get_data is set) if it is found and
//     not expired,
//   - a AS_PROTO_RESULT_FAIL_NOTFOUND error entry if the record is missing,
//     expired, or the partition could not be reserved for read.
// *bb_r is created on demand the first time a pending digest is seen.
//
// A digest is marked done only when the partition reservation succeeded. When
// it fails, bmd->node is set to the node reported by the reservation call (if
// any) and the digest is left pending.
//
// After every g_config.batch_priority processed digests the thread yields via
// usleep(1).
// NOTE(review): g_config.batch_priority is used as a modulo divisor, so this
// relies on it never being 0 - confirm the config layer guarantees that.
static void
batch_build_response(batch_transaction* btr, cf_buf_builder** bb_r)
{
	as_namespace* ns = btr->ns;
	batch_digests *bmds = btr->digests;
	bool get_data = btr->get_data;
	uint32_t yield_count = 0;

	for (int i = 0; i < bmds->n_digests; i++)
	{
		batch_digest *bmd = &bmds->digest[i];

		if (bmd->done == false) {
			// try to get the key
			as_partition_reservation rsv;
			AS_PARTITION_RESERVATION_INIT(rsv);
			cf_node other_node = 0;
			uint64_t cluster_key;

			if (! *bb_r) {
				*bb_r = cf_buf_builder_create_size(1024 * 4);
			}

			int rv = as_partition_reserve_read(ns, as_partition_getid(bmd->keyd), &rsv, &other_node, &cluster_key);

			if (rv == 0) {
				cf_atomic_int_incr(&g_config.batch_tree_count);

				as_index_ref r_ref;
				r_ref.skip_lock = false;
				int rec_rv = as_record_get(rsv.tree, &bmd->keyd, &r_ref, ns);

				if (rec_rv == 0) {
					as_index *r = r_ref.r;

					// Check to see this isn't an expired record waiting to die.
					if (r->void_time && r->void_time < as_record_void_time_get()) {
						as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);
					}
					else {
						// Make sure it's brought in from storage if necessary.
						// rd is left uninitialized when get_data is false; every
						// later use of it is guarded by get_data.
						as_storage_rd rd;
						if (get_data) {
							as_storage_record_open(ns, r, &rd, &r->key);
							rd.n_bins = as_bin_get_n_bins(r, &rd);
						}

						// Note: this array must stay in scope until the
						// response for this record has been built, since in the
						// get data w/ record on device case, it's copied by
						// reference directly into the record descriptor.
						as_bin stack_bins[!get_data || rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

						if (get_data) {
							// Figure out which bins you want - for now, all.
							rd.bins = as_bin_get_all(r, &rd, stack_bins);
							rd.n_bins = as_bin_inuse_count(&rd);
						}

						as_msg_make_response_bufbuilder(r, (get_data ? &rd : NULL), bb_r, !get_data, (get_data ? NULL : ns->name), true, false, btr->binlist);

						if (get_data) {
							as_storage_record_close(r, &rd);
						}
					}
					as_record_done(&r_ref, ns);
				}
				else {
					// TODO - what about empty records?
					// NOTE(review): the digest is reinterpreted as a uint64_t
					// for logging only; a digest logging macro would avoid
					// the cast.
					cf_debug(AS_BATCH, "batch_build_response: as_record_get returned %d : key %"PRIx64, rec_rv, *(uint64_t *)&bmd->keyd);
					as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);
				}

				bmd->done = true;

				as_partition_release(&rsv);
				cf_atomic_int_decr(&g_config.batch_tree_count);
			}
			else {
				cf_debug(AS_BATCH, "batch_build_response: partition reserve read failed: rv %d", rv);

				as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);

				if (other_node != 0) {
					bmd->node = other_node;
					// cf_node is an integer node id, not a pointer, so it is
					// printed as a 64-bit hex value rather than with %p.
					cf_debug(AS_BATCH, "other_node is: %"PRIx64".", other_node);
				} else {
					cf_debug(AS_BATCH, "other_node is NULL.");
				}
			}

			yield_count++;
			if (yield_count % g_config.batch_priority == 0) {
				usleep(1);
			}
		}
	}
}
/* Internal Function: Does the post processing for the UDF record after the
 *					  UDF execution. Does the following:
 *		1. The operation type is obtained via getop(); writes and deletes
 *		   are marked for XDR shipping.
 *		2. urecord_op is updated to delete in case there is no bin left in
 *		   the record (udf_zero_bins_left()); the record is then removed
 *		   from the tree.
 *		3. For a write, urecord->pickled_buf / pickled_sz /
 *		   pickled_rec_props are populated before the record is closed, and
 *		   the stored-key index flag is brought in line with rd->key.
 *		4. urecord->op is set and the record is closed
 *		5. The operation is handed to xdr_write() if it was marked for
 *		   shipping
 *
 *	Returns: Nothing
 *
 *	Parameters: urecord          - UDF record to operate on
 *				urecord_op (out) - Populated with the optype
 *				set_id           - set id passed to xdr_write(); replaced by
 *								   the id read from the index when the
 *								   record is still open
 */
static void
post_processing(udf_record *urecord, udf_optype *urecord_op, uint16_t set_id)
{
	as_storage_rd      *rd   = urecord->rd;
	as_transaction     *tr   = urecord->tr;
	as_index_ref    *r_ref   = urecord->r_ref;

	// INIT
	urecord->pickled_buf     = NULL;
	urecord->pickled_sz      = 0;
	as_rec_props_clear(&urecord->pickled_rec_props);
	bool udf_xdr_ship_op = false;

	getop(urecord, urecord_op);

	if (UDF_OP_IS_DELETE(*urecord_op)
			|| UDF_OP_IS_WRITE(*urecord_op)) {
		udf_xdr_ship_op = true;
	}

	// Log the record pointer itself (not the address of the local variable
	// that holds it) so the value can be matched against other log lines.
	cf_detail(AS_UDF, "FINISH working with LDT Record %p %p %p %p %d", urecord,
			urecord->tr, urecord->r_ref, urecord->rd,
			(urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN));

	// If there exists a record reference but no bin of the record is in use,
	// delete the record. remove from the tree. Only LDT_RECORD here not needed
	// for LDT_SUBRECORD (only do it if requested by UDF). All the SUBRECORD of
	// removed LDT_RECORD will be lazily cleaned up by defrag.
	//
	// NOTE(review): udf_xdr_ship_op was decided above, so a delete chosen
	// here is shipped only if getop() already reported a write or delete -
	// confirm that is intended.
	if (udf_zero_bins_left(urecord)) {
		as_index_delete(tr->rsv.tree, &tr->keyd);
		urecord->starting_memory_bytes = 0;
		*urecord_op                    = UDF_OPTYPE_DELETE;
	}
	else if (*urecord_op == UDF_OPTYPE_WRITE)	{
		cf_detail_digest(AS_UDF, &rd->keyd, "Committing Changes n_bins %d", as_bin_get_n_bins(r_ref->r, rd));

		// rec_props_data is handed to rd and must stay in scope until
		// write_udf_post_processing() below has run.
		size_t  rec_props_data_size = as_storage_record_rec_props_size(rd);
		uint8_t rec_props_data[rec_props_data_size];
		if (rec_props_data_size > 0) {
			as_storage_record_set_rec_props(rd, rec_props_data);
		}

		write_udf_post_processing(tr, rd, &urecord->pickled_buf,
			&urecord->pickled_sz, &urecord->pickled_rec_props,
			urecord->starting_memory_bytes);

		// Now ok to accommodate a new stored key...
		if (! as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_allocate_key(r_ref->r, rd->key, rd->key_size);
			}

			as_index_set_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
		// ... or drop a stored key.
		else if (as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && ! rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_remove_key(r_ref->r);
			}

			as_index_clear_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
	}

	// Collect the record information (for XDR) before closing the record
	as_generation generation = 0;
	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
		generation = r_ref->r->generation;
		set_id = as_index_get_set_id(r_ref->r);
	}
	urecord->op = *urecord_op;
	// Close the record for all the cases
	udf_record_close(urecord);

	// Write to XDR pipe after closing the record, in order to release the record lock as
	// early as possible.
	if (udf_xdr_ship_op) {
		// The digest is not an integer, so it goes through the digest logging
		// macro rather than a %PRIx64 conversion.
		if (UDF_OP_IS_WRITE(*urecord_op)) {
			cf_detail_digest(AS_UDF, &tr->keyd, "UDF write shipping for key ");
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, false, set_id);
		} else if (UDF_OP_IS_DELETE(*urecord_op)) {
			cf_detail_digest(AS_UDF, &tr->keyd, "UDF delete shipping for key ");
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, true, set_id);
		}
	}
}