/*
 * Write the record described by rd to storage.
 *
 * Asserts (via cf_assert) that at least one bin of rd is in use - a binless
 * pickle is not expected here - and then returns the result of
 * as_storage_record_write().
 */
int
as_record_write_from_pickle(as_storage_rd* rd)
{
	// A record reaching this point must carry at least one bin.
	cf_assert(as_bin_inuse_has(rd), AS_RECORD, "unexpected binless pickle");

	int write_result = as_storage_record_write(rd);

	return write_result;
}
Exemple #2
0
/*
 * Find the in-use bin of a record that matches a length-delimited bin name.
 *
 * For a single-bin namespace the name is ignored: the record's only bin is
 * returned if it is in use. Otherwise the name is resolved to a bin id with
 * as_bin_get_id_w_len() and the record's bins are scanned for that id.
 *
 * Returns the matching bin, or NULL if the name cannot be resolved or no in-use
 * bin carries the resolved id.
 */
as_bin *
as_bin_get_from_buf(as_storage_rd *rd, byte *name, size_t namesz)
{
	if (rd->ns->single_bin) {
		if (! as_bin_inuse_has(rd)) {
			return NULL;
		}

		return rd->bins;
	}

	uint32_t wanted_id;

	if (! as_bin_get_id_w_len(rd->ns, name, namesz, &wanted_id)) {
		return NULL;
	}

	uint16_t pos = 0;

	while (pos < rd->n_bins) {
		as_bin *candidate = &rd->bins[pos];

		// The first slot that is not in use ends the search.
		if (! as_bin_inuse(candidate)) {
			return NULL;
		}

		if ((uint32_t)candidate->id == wanted_id) {
			return candidate;
		}

		pos++;
	}

	return NULL;
}
Exemple #3
0
/*
 * Return the index, within rd->bins, of the in-use bin that matches a
 * length-delimited bin name.
 *
 * For a single-bin namespace the name is ignored and the answer is 0 if the
 * record has a bin in use. Otherwise the name is resolved to an id with
 * as_bin_get_id_from_name_buf() and the bins are scanned for that id.
 *
 * Returns the bin's index, or -1 if the name cannot be resolved or no in-use
 * bin carries the resolved id.
 */
int32_t
as_bin_get_index(as_storage_rd *rd, byte *name, size_t namesz)
{
	if (rd->ns->single_bin) {
		if (as_bin_inuse_has(rd)) {
			return 0;
		}

		return -1;
	}

	uint32_t wanted_id;

	if (! as_bin_get_id_from_name_buf(rd->ns, name, namesz, &wanted_id)) {
		return -1;
	}

	int32_t found_at = -1;
	uint16_t pos = 0;

	while (pos < rd->n_bins) {
		as_bin *candidate = &rd->bins[pos];

		// The first slot that is not in use ends the search.
		if (! as_bin_inuse(candidate)) {
			break;
		}

		if ((uint32_t)candidate->id == wanted_id) {
			found_at = (int32_t)pos;
			break;
		}

		pos++;
	}

	return found_at;
}
Exemple #4
0
/*
 * Return the index, within rd->bins, of the in-use bin with the given name.
 *
 * For a single-bin namespace the name is ignored and the answer is 0 if the
 * record has a bin in use. Otherwise the name is looked up in the namespace's
 * bin-name vmap and the bins are scanned for the resulting id.
 *
 * Returns the bin's index, or -1 if the name is not in the vmap or no in-use
 * bin carries its id.
 */
int32_t
as_bin_get_index(as_storage_rd *rd, const char *name)
{
	if (rd->ns->single_bin) {
		if (! as_bin_inuse_has(rd)) {
			return -1;
		}

		return 0;
	}

	uint32_t wanted_id;

	if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name, &wanted_id) == CF_VMAPX_OK) {
		uint16_t pos = 0;

		while (pos < rd->n_bins) {
			as_bin *candidate = &rd->bins[pos];

			// The first slot that is not in use ends the search.
			if (! as_bin_inuse(candidate)) {
				return -1;
			}

			if ((uint32_t)candidate->id == wanted_id) {
				return (int32_t)pos;
			}

			pos++;
		}
	}

	return -1;
}
/*
 * Tell whether an open, non-subrecord UDF record has no bins in use.
 *
 * Returns true only when the record is not flagged as a sub-record, is flagged
 * open, and as_bin_inuse_has() reports no bin in use for its storage record.
 */
static inline bool
udf_zero_bins_left(udf_record *urecord)
{
	// Sub-records never qualify.
	if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) {
		return false;
	}

	// Only an open record can be inspected for bins.
	if (! (urecord->flag & UDF_RECORD_FLAG_OPEN)) {
		return false;
	}

	return ! as_bin_inuse_has(urecord->rd);
}
Exemple #6
0
// Return the record's bin with the given name, initializing a new bin in the
// first free slot if the record does not have one yet.
//
// Does not check bin name length.
// Checks bin name quota - use appropriately.
//
// Returns NULL only when the name is not yet in the namespace's bin-name vmap
// and the vmap already holds BIN_NAMES_QUOTA names. Crashes (cf_crash) if a new
// bin is needed but rd has no free slot.
as_bin *
as_bin_get_or_create(as_storage_rd *rd, const char *name)
{
	if (rd->ns->single_bin) {
		// Single-bin: the name is irrelevant, there is exactly one slot.
		if (as_bin_inuse_has(rd)) {
			return rd->bins;
		}

		as_bin_init_nameless(rd->bins);

		return rd->bins;
	}

	// Sentinel meaning "name not (yet) present in the vmap".
	uint32_t name_id = (uint32_t)-1;
	uint16_t slot = 0;

	if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name, &name_id) != CF_VMAPX_OK) {
		// Unknown name - it would have to be added, so enforce the quota.
		if (cf_vmapx_count(rd->ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) {
			cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", rd->ns->name, name);
			return NULL;
		}

		slot = as_bin_inuse_count(rd);
	}
	else {
		// Known name - the record may already have this bin. If not, slot ends
		// up at the first unused position (or at n_bins if there is none).
		while (slot < rd->n_bins) {
			as_bin *existing = &rd->bins[slot];

			if (! as_bin_inuse(existing)) {
				break;
			}

			if ((uint32_t)existing->id == name_id) {
				return existing;
			}

			slot++;
		}
	}

	if (slot >= rd->n_bins) {
		cf_crash(AS_BIN, "ran out of allocated bins in rd");
	}

	as_bin *fresh = &rd->bins[slot];

	if (name_id != (uint32_t)-1) {
		// Name already has an id - just stamp it on a nameless bin.
		as_bin_init_nameless(fresh);
		fresh->id = (uint16_t)name_id;
	}
	else {
		as_bin_init(rd->ns, fresh, name);
	}

	return fresh;
}
Exemple #7
0
/*
 * Look up a record's bin by name.
 *
 * For a single-bin namespace the name is ignored and the record's only bin is
 * returned if it is in use. Otherwise the name is resolved through the
 * namespace's bin-name vmap and the lookup is delegated to as_bin_get_by_id().
 *
 * Returns NULL if the single bin is not in use or the name is not in the vmap;
 * otherwise whatever as_bin_get_by_id() returns.
 */
as_bin *
as_bin_get(as_storage_rd *rd, const char *name)
{
	if (rd->ns->single_bin) {
		if (! as_bin_inuse_has(rd)) {
			return NULL;
		}

		return rd->bins;
	}

	uint32_t bin_id;

	if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name, &bin_id) == CF_VMAPX_OK) {
		return as_bin_get_by_id(rd, bin_id);
	}

	return NULL;
}
Exemple #8
0
/*
 * Look up a record's bin by length-delimited name, registering the name in the
 * namespace's bin-name vmap if it is not there yet.
 *
 * Outputs:
 *   *p_reserved - set to true on entry; set to false if the name was unknown
 *                 and could not be added (quota reached, or cf_vmapx_put_unique()
 *                 failed with something other than "name exists").
 *   *p_idx      - written by the vmap lookup / insert calls; not touched for a
 *                 single-bin namespace.
 *
 * Returns the in-use bin carrying the name's id, or NULL if there is none
 * (which is always the case when the name was not already in the vmap).
 */
as_bin *
as_bin_get_and_reserve_name(as_storage_rd *rd, byte *name, size_t namesz,
		bool *p_reserved, uint32_t *p_idx)
{
	*p_reserved = true;

	if (rd->ns->single_bin) {
		if (! as_bin_inuse_has(rd)) {
			return NULL;
		}

		return rd->bins;
	}

	// Null-terminated copy of the name for the vmap calls and log messages.
	char name_z[namesz + 1];

	memcpy(name_z, name, namesz);
	name_z[namesz] = 0;

	if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name_z, p_idx) == CF_VMAPX_OK) {
		// Known name - see whether this record has the bin.
		uint16_t pos = 0;

		while (pos < rd->n_bins) {
			as_bin *candidate = &rd->bins[pos];

			if (! as_bin_inuse(candidate)) {
				return NULL;
			}

			if ((uint32_t)candidate->id == *p_idx) {
				return candidate;
			}

			pos++;
		}

		return NULL;
	}

	// Unknown name - no bin to return, but try to reserve the name.
	if (cf_vmapx_count(rd->ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) {
		cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", rd->ns->name, name_z);
		*p_reserved = false;
		return NULL;
	}

	cf_vmapx_err put_result = cf_vmapx_put_unique(rd->ns->p_bin_name_vmap, name_z, p_idx);

	// "Name exists" is tolerated alongside plain success.
	if (put_result != CF_VMAPX_OK && put_result != CF_VMAPX_ERR_NAME_EXISTS) {
		cf_warning(AS_BIN, "{%s} can't add new bin name %s, vmap err %d", rd->ns->name, name_z, put_result);
		*p_reserved = false;
	}

	return NULL;
}
Exemple #9
0
/*
 * Service a read transaction against the local copy of the record.
 *
 * Looks the record up in tr->rsv.tree, rejects doomed records and tombstones,
 * applies repl_state_check(), optionally verifies the message key, then
 * collects either all bins (AS_MSG_INFO1_GET_ALL) or the bins named by the
 * message's read ops and hands them to the response path.
 *
 * Returns:
 *   TRANS_DONE_SUCCESS - the read completed (including the "no bins" case).
 *   TRANS_DONE_ERROR   - read_local_done() was called with an error code.
 *   TRANS_IN_PROGRESS / TRANS_WAITING - repl_state_check() returned a non-zero
 *       value other than -3; no response is sent from here.
 */
transaction_status
read_local(as_transaction* tr)
{
	as_msg* m = &tr->msgp->msg;
	as_namespace* ns = tr->rsv.ns;

	as_index_ref r_ref;

	// Record not in the tree - nothing to release, so r_ref is passed as NULL.
	if (as_record_get(tr->rsv.tree, &tr->keyd, &r_ref) != 0) {
		read_local_done(tr, NULL, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	as_record* r = r_ref.r;

	// Check if it's an expired or truncated record.
	if (as_record_is_doomed(r, ns)) {
		read_local_done(tr, &r_ref, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	int result = repl_state_check(r, tr);

	if (result != 0) {
		// -3 is reported to the client as unavailable; any other non-zero
		// value means the transaction continues elsewhere.
		if (result == -3) {
			read_local_done(tr, &r_ref, NULL, AS_ERR_UNAVAILABLE);
			return TRANS_DONE_ERROR;
		}

		// No response sent to origin.
		as_record_done(&r_ref, ns);
		return result == 1 ? TRANS_IN_PROGRESS : TRANS_WAITING;
	}

	// Check if it's a tombstone.
	if (! as_record_is_live(r)) {
		read_local_done(tr, &r_ref, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	as_storage_rd rd;

	// From here on rd is open, so error paths pass &rd to read_local_done().
	as_storage_record_open(ns, r, &rd);

	// If configuration permits, allow reads to use page cache.
	rd.read_page_cache = ns->storage_read_page_cache;

	// Check the key if required.
	// Note - for data-not-in-memory "exists" ops, key check is expensive!
	if (as_transaction_has_key(tr) &&
			as_storage_record_get_key(&rd) && ! check_msg_key(m, &rd)) {
		read_local_done(tr, &r_ref, &rd, AS_ERR_KEY_MISMATCH);
		return TRANS_DONE_ERROR;
	}

	// Metadata-only request: copy the record's metadata to the transaction and
	// finish without loading any bins.
	if ((m->info1 & AS_MSG_INFO1_GET_NO_BINS) != 0) {
		tr->generation = r->generation;
		tr->void_time = r->void_time;
		tr->last_update_time = r->last_update_time;

		read_local_done(tr, &r_ref, &rd, AS_OK);
		return TRANS_DONE_SUCCESS;
	}

	// Loader failures come back negative; the negated value is used as the
	// result code.
	if ((result = as_storage_rd_load_n_bins(&rd)) < 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_n_bins() ", ns->name);
		read_local_done(tr, &r_ref, &rd, -result);
		return TRANS_DONE_ERROR;
	}

	// Sized to hold the record's bins only when data is not in memory;
	// zero-length otherwise.
	as_bin stack_bins[ns->storage_data_in_memory ? 0 : rd.n_bins];

	if ((result = as_storage_rd_load_bins(&rd, stack_bins)) < 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_bins() ", ns->name);
		read_local_done(tr, &r_ref, &rd, -result);
		return TRANS_DONE_ERROR;
	}

	if (! as_bin_inuse_has(&rd)) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: found record with no bins ", ns->name);
		read_local_done(tr, &r_ref, &rd, AS_ERR_UNKNOWN);
		return TRANS_DONE_ERROR;
	}

	// Upper bound on response entries: every bin for get-all, otherwise one per
	// op in the message.
	uint32_t bin_count = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0 ?
			rd.n_bins : m->n_ops;

	// NOTE(review): when GET_ALL is not set and m->n_ops is 0, these arrays are
	// declared with length 0 before the n_ops check further down.
	as_msg_op* ops[bin_count];
	as_msg_op** p_ops = ops;
	as_bin* response_bins[bin_count];
	uint16_t n_bins = 0;

	// Scratch bins receiving CDT read results; released with
	// destroy_stack_bins() on every path after this point that may have filled
	// any of them.
	as_bin result_bins[bin_count];
	uint32_t n_result_bins = 0;

	if ((m->info1 & AS_MSG_INFO1_GET_ALL) != 0) {
		// Get-all: no per-op array accompanies the bins.
		p_ops = NULL;
		n_bins = rd.n_bins;
		as_bin_get_all_p(&rd, response_bins);
	}
	else {
		if (m->n_ops == 0) {
			cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: bin op(s) expected, none present ", ns->name);
			read_local_done(tr, &r_ref, &rd, AS_ERR_PARAMETER);
			return TRANS_DONE_ERROR;
		}

		// When set, ops that yield no bin still get a (NULL) response entry.
		bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0;

		as_msg_op* op = 0;
		int n = 0;

		// ops[] and response_bins[] are filled in parallel, indexed by n_bins.
		while ((op = as_msg_op_iterate(m, op, &n)) != NULL) {
			if (op->op == AS_MSG_OP_READ) {
				as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);

				if (b || respond_all_ops) {
					ops[n_bins] = op;
					response_bins[n_bins++] = b;
				}
			}
			else if (op->op == AS_MSG_OP_CDT_READ) {
				as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);

				if (b) {
					// Tentatively use the next scratch bin; it is only counted
					// (n_result_bins++) if the CDT read leaves it in use.
					as_bin* rb = &result_bins[n_result_bins];
					as_bin_set_empty(rb);

					if ((result = as_bin_cdt_read_from_client(b, op, rb)) < 0) {
						cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_bin_cdt_read_from_client() ", ns->name);
						destroy_stack_bins(result_bins, n_result_bins);
						read_local_done(tr, &r_ref, &rd, -result);
						return TRANS_DONE_ERROR;
					}

					if (as_bin_inuse(rb)) {
						n_result_bins++;
						ops[n_bins] = op;
						response_bins[n_bins++] = rb;
					}
					else if (respond_all_ops) {
						ops[n_bins] = op;
						response_bins[n_bins++] = NULL;
					}
				}
				else if (respond_all_ops) {
					ops[n_bins] = op;
					response_bins[n_bins++] = NULL;
				}
			}
			else {
				// Only READ and CDT_READ ops are accepted on this path.
				cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: unexpected bin op %u ", ns->name, op->op);
				destroy_stack_bins(result_bins, n_result_bins);
				read_local_done(tr, &r_ref, &rd, AS_ERR_PARAMETER);
				return TRANS_DONE_ERROR;
			}
		}
	}

	cf_dyn_buf_define_size(db, 16 * 1024);

	if (tr->origin != FROM_BATCH) {
		// Build the response message now, while the bins are still valid. The
		// buffer capacity goes in through db.used_sz, whose address is handed
		// to as_msg_make_response_msg() - presumably updated to the message
		// size; confirm against that function.
		db.used_sz = db.alloc_sz;
		db.buf = (uint8_t*)as_msg_make_response_msg(tr->result_code,
				r->generation, r->void_time, p_ops, response_bins, n_bins, ns,
				(cl_msg*)dyn_bufdb, &db.used_sz, as_transaction_trid(tr));

		// The returned pointer equals dyn_bufdb when the stack buffer was used.
		db.is_stack = db.buf == dyn_bufdb;
		// Note - not bothering to correct alloc_sz if buf was allocated.
	}
	else {
		tr->generation = r->generation;
		tr->void_time = r->void_time;
		tr->last_update_time = r->last_update_time;

		// Since as_batch_add_result() constructs response directly in shared
		// buffer to avoid extra copies, can't use db.
		send_read_response(tr, p_ops, response_bins, n_bins, NULL);
	}

	// Release CDT scratch bins, then storage, then the record reference.
	destroy_stack_bins(result_bins, n_result_bins);
	as_storage_record_close(&rd);
	as_record_done(&r_ref, ns);

	// Now that we're not under the record lock, send the message we just built.
	if (db.used_sz != 0) {
		send_read_response(tr, NULL, NULL, 0, &db);

		cf_dyn_buf_free(&db);
		tr->from.proto_fd_h = NULL;
	}

	return TRANS_DONE_SUCCESS;
}
/**
 * aerospike::create(record)
 * Function: udf_aerospike_rec_create
 *
 * Parameters:
 * 		as - as_aerospike
 *		rec - as_rec
 *
 * Return Values:
 * 		non-zero result of udf_aerospike_param_check if the parameters are bad
 * 		1 if the udf_record is already open, or the record already exists
 * 		negative value from as_record_get_create if the record cannot be opened
 * 		4 if the set name from the message cannot be applied
 * 		o/w return value of udf_aerospike__execute_updates
 *
 * Description:
 * 		Create a new record in local storage.
 * 		The record will only be created if it does not exist.
 * 		This assumes the record has a digest that is valid for local storage.
 *
 *		Synchronization : object lock acquired by the transaction thread executing UDF.
 * 		Partition reservation takes place just before the transaction starts executing
 * 		( look for as_partition_reserve_udf in thr_tsvc.c )
 *
 * 		Callers:
 * 		lua interfacing function, mod_lua_aerospike_rec_create
 * 		The return value of udf_aerospike_rec_create is pushed on to the lua stack
 *
 * 		Notes:
 * 		On success UDF_RECORD_FLAG_OPEN and UDF_RECORD_FLAG_STORAGE_OPEN are set
 * 		on the udf_record.
*/
static int
udf_aerospike_rec_create(const as_aerospike * as, const as_rec * rec)
{
	int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__);
	if (ret) {
		return ret;
	}

	udf_record * urecord  = (udf_record *) as_rec_source(rec);

	// make sure record isn't already successfully read
	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
		cf_detail(AS_UDF, "udf_aerospike_rec_create: Record Already Exists");
		return 1;
	}
	as_transaction *tr    = urecord->tr;
	as_index_ref   *r_ref = urecord->r_ref;
	as_storage_rd  *rd    = urecord->rd;
	as_index_tree  *tree  = tr->rsv.tree;

	// Sub-records are kept in the reservation's sub-tree.
	if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) {
		tree      = tr->rsv.sub_tree;
	}

	// make sure we got the record as a create
	int rv = as_record_get_create(tree, &tr->keyd, r_ref, tr->rsv.ns);
	cf_detail_digest(AS_UDF, &tr->keyd, "Creating %sRecord",
			(urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) ? "Sub" : "");

	// rv 0 means record exists, 1 means create, < 0 means fail
	// TODO: Verify correct result codes.
	if (rv == 0) {
		cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Already Exists 2");
		as_record_done(r_ref, tr->rsv.ns);
		bzero(r_ref, sizeof(as_index_ref));
		return 1;
	} else if (rv < 0) {
		cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Open Failed with rv=%d", rv);
		return rv;
	}

	// Associates the set name with the storage rec and index
	if (tr->msgp) {
		// Set the set name to index and close record if the setting the set name
		// is not successful
		int rv_set = as_record_set_set_from_msg(r_ref->r, tr->rsv.ns, &tr->msgp->msg);
		if (rv_set != 0) {
			cf_warning(AS_UDF, "udf_aerospike_rec_create: Failed to set setname");
			as_record_done(r_ref, tr->rsv.ns);
			// TODO bzero is expensive. Switch to use flag.
			bzero(r_ref, sizeof(as_index_ref));
			return 4;
		}
	}

	urecord->flag |= UDF_RECORD_FLAG_OPEN;
	cf_detail(AS_UDF, "Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd);

	as_index *r    = r_ref->r;
	// open up storage
	as_storage_record_create(urecord->tr->rsv.ns, urecord->r_ref->r,
		urecord->rd, &urecord->tr->keyd);

	cf_detail(AS_UDF, "as_storage_record_create: udf_aerospike_rec_create: r %p rd %p",
		urecord->r_ref->r, urecord->rd);

	// if multibin storage, we will use urecord->stack_bins, so set the size appropriately
	if ( ! rd->ns->storage_data_in_memory && ! rd->ns->single_bin ) {
		rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin);
	}

	// side effect: will set the unused bins to properly unused
	rd->bins       = as_bin_get_all(r, rd, urecord->stack_bins);
	urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN;

	// If the message has a key, apply it to the record. tr->msgp is treated as
	// optional when the set name is applied above, so it must not be
	// dereferenced unconditionally here either.
	if (tr->msgp) {
		as_msg_field* f = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_KEY);
		if (f) {
			rd->key_size = as_msg_field_get_value_sz(f);
			rd->key = f->data;
		}
	}

	cf_detail(AS_UDF, "Storage Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd);
	cf_detail(AS_UDF, "udf_aerospike_rec_create: Record created %d", urecord->flag);

	int rc         = udf_aerospike__execute_updates(urecord);
	if (rc) {
		//  Creating the udf record failed, destroy the as_record
		if (!as_bin_inuse_has(urecord->rd)) {
			udf_aerospike_rec_remove(as, rec);
		}
	}
	return rc;
}
Exemple #11
0
// Return the record's bin matching a length-delimited name, initializing a new
// bin in the first free slot if the record does not have one yet, subject to
// the create_only / replace_only bin-level policy.
//
// Does not check bin name length.
// Checks bin name quota and bin-level policy - use appropriately.
//
// On a NULL return *p_result holds the reason:
//   AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE - the matching bin is hidden.
//   AS_PROTO_RESULT_FAIL_BIN_EXISTS        - bin exists and create_only is set.
//   AS_PROTO_RESULT_FAIL_BIN_NAME          - unknown name and the quota is full.
//   AS_PROTO_RESULT_FAIL_BIN_NOT_FOUND     - no such bin and replace_only is set.
// Crashes (cf_crash) if a new bin is needed but rd has no free slot.
as_bin *
as_bin_get_or_create_from_buf(as_storage_rd *rd, byte *name, size_t namesz,
		bool create_only, bool replace_only, int *p_result)
{
	if (rd->ns->single_bin) {
		// Ignored bin-level policy - single-bin needs only record-level policy.
		if (as_bin_inuse_has(rd)) {
			return rd->bins;
		}

		as_bin_init_nameless(rd->bins);

		return rd->bins;
	}

	// Sentinel meaning "name not (yet) present in the vmap".
	uint32_t name_id = (uint32_t)-1;
	uint16_t slot = 0;

	if (cf_vmapx_get_index_w_len(rd->ns->p_bin_name_vmap, (const char *)name, namesz, &name_id) != CF_VMAPX_OK) {
		// Unknown name - it would have to be added, so enforce the quota.
		if (cf_vmapx_count(rd->ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) {
			// Null-terminated copy, needed only for the log message.
			char name_z[namesz + 1];

			memcpy(name_z, name, namesz);
			name_z[namesz] = 0;

			cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", rd->ns->name, name_z);
			*p_result = AS_PROTO_RESULT_FAIL_BIN_NAME;
			return NULL;
		}

		slot = as_bin_inuse_count(rd);
	}
	else {
		// Known name - the record may already have this bin. If not, slot ends
		// up at the first unused position (or at n_bins if there is none).
		for (; slot < rd->n_bins; slot++) {
			as_bin *existing = &rd->bins[slot];

			if (! as_bin_inuse(existing)) {
				break;
			}

			if ((uint32_t)existing->id != name_id) {
				continue;
			}

			if (as_bin_is_hidden(existing)) {
				cf_warning(AS_BIN, "cannot manipulate hidden bin directly");
				*p_result = AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
				return NULL;
			}

			if (create_only) {
				*p_result = AS_PROTO_RESULT_FAIL_BIN_EXISTS;
				return NULL;
			}

			return existing;
		}
	}

	// The bin does not exist in this record.
	if (replace_only) {
		*p_result = AS_PROTO_RESULT_FAIL_BIN_NOT_FOUND;
		return NULL;
	}

	if (slot >= rd->n_bins) {
		cf_crash(AS_BIN, "ran out of allocated bins in rd");
	}

	as_bin *fresh = &rd->bins[slot];

	if (name_id != (uint32_t)-1) {
		// Name already has an id - just stamp it on a nameless bin.
		as_bin_init_nameless(fresh);
		fresh->id = (uint16_t)name_id;
	}
	else {
		as_bin_init_w_len(rd->ns, fresh, name, namesz);
	}

	return fresh;
}
Exemple #12
0
/* Internal Function: Does the post processing for the UDF record after the
 *					  UDF execution. Does the following:
 *		1. urecord_op is derived from the record's flags, and is forced to
 *		   delete in case the record is open but has no bin left in it (the
 *		   record is then removed from the index tree).
 *		2. urecord->pickled_buf (and related pickled_* fields) are populated
 *		   before the record is closed in case it was a write operation.
 *		3. Record is closed.
 *		4. The operation is shipped to XDR where applicable.
 *		5. UDF updates cache is cleared.
 *
 *	Returns: Nothing
 *
 *	Parameters: urecord          - UDF record to operate on
 *				urecord_op (out) - Populated with the optype
 *				set_id           - set id handed to xdr_write(); replaced by the
 *								   record's own set id when the record is open
 */
void
udf_rw_post_processing(udf_record *urecord, udf_optype *urecord_op, uint16_t set_id)
{
	as_storage_rd      *rd   = urecord->rd;
	as_transaction     *tr   = urecord->tr;
	as_index_ref    *r_ref   = urecord->r_ref;

	// INIT
	urecord->pickled_buf     = NULL;
	urecord->pickled_sz      = 0;
	urecord->pickled_void_time     = 0;
	as_rec_props_clear(&urecord->pickled_rec_props);
	bool udf_xdr_ship_op = false;

	// TODO: optimize not to allocate buffer if it is single
	// node cluster. No remote to send data to
	// Check if UDF has updates.
	if (urecord->flag & UDF_RECORD_FLAG_HAS_UPDATES) {
		// Check if the record is not deleted after an update
		if ( urecord->flag & UDF_RECORD_FLAG_OPEN) {
			*urecord_op = UDF_OPTYPE_WRITE;
			udf_xdr_ship_op = true;
		}
		else {
			// If the record has updates and it is not open,
			// and if it pre-existed it's an update followed by a delete.
			if ( urecord->flag & UDF_RECORD_FLAG_PREEXISTS) {
				*urecord_op = UDF_OPTYPE_DELETE;
				udf_xdr_ship_op = true;
			}
			// If the record did not pre-exist and is updated
			// and it is not open, then it is create followed by
			// delete essentially no_op.
			else {
				*urecord_op = UDF_OPTYPE_NONE;
			}
		}
	} else if ((urecord->flag & UDF_RECORD_FLAG_PREEXISTS)
			   && !(urecord->flag & UDF_RECORD_FLAG_OPEN)) {
		// No updates, but a pre-existing record is no longer open - a delete.
		*urecord_op  = UDF_OPTYPE_DELETE;
		udf_xdr_ship_op = true;
	} else {
		*urecord_op  = UDF_OPTYPE_READ;
	}

	// Log the record pointer itself, not the address of the local parameter.
	cf_detail(AS_UDF, "FINISH working with LDT Record %p %p %p %p %d", urecord,
			urecord->tr, urecord->r_ref, urecord->rd,
			(urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN));

	// If there exists a record reference but no bin of the record is in use,
	// delete the record. remove from the tree. Only LDT_RECORD here not needed
	// for LDT_SUBRECORD (only do it if requested by UDF). All the SUBRECORD of
	// removed LDT_RECORD will be lazily cleaned up by defrag.
	if (!(urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD)
			&& urecord->flag & UDF_RECORD_FLAG_OPEN
			&& !as_bin_inuse_has(rd)) {
		as_index_delete(tr->rsv.tree, &tr->keyd);
		urecord->starting_memory_bytes = 0;
		*urecord_op                    = UDF_OPTYPE_DELETE;
		udf_xdr_ship_op = true;
	} else if (*urecord_op == UDF_OPTYPE_WRITE)	{
		// A digest is not a 64-bit integer - log its leading 8 bytes, using the
		// same idiom as the other digest log lines in this code.
		cf_detail(AS_UDF, "Committing Changes %"PRIx64" n_bins %d", *(uint64_t *)&rd->keyd, as_bin_get_n_bins(r_ref->r, rd));

		// Stack space for the record properties, sized by the storage layer.
		size_t  rec_props_data_size = as_storage_record_rec_props_size(rd);
		uint8_t rec_props_data[rec_props_data_size];
		if (rec_props_data_size > 0) {
			as_storage_record_set_rec_props(rd, rec_props_data);
		}

		write_local_post_processing(tr, tr->rsv.ns, NULL, &urecord->pickled_buf,
			&urecord->pickled_sz, &urecord->pickled_void_time,
			&urecord->pickled_rec_props, true/*increment_generation*/,
			NULL, r_ref->r, rd, urecord->starting_memory_bytes);

		// Now ok to accommodate a new stored key...
		if (! as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_allocate_key(r_ref->r, rd->key, rd->key_size);
			}

			as_index_set_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
		// ... or drop a stored key.
		else if (as_index_is_flag_set(r_ref->r, AS_INDEX_FLAG_KEY_STORED) && ! rd->key) {
			if (rd->ns->storage_data_in_memory) {
				as_record_remove_key(r_ref->r);
			}

			as_index_clear_flags(r_ref->r, AS_INDEX_FLAG_KEY_STORED);
		}
	}

	// Collect the record information (for XDR) before closing the record
	as_generation generation = 0;
	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
		generation = r_ref->r->generation;
		set_id = as_index_get_set_id(r_ref->r);
	}
	// Close the record for all the cases
	udf_record_close(urecord, false);

	// Write to XDR pipe after closing the record, in order to release the record lock as
	// early as possible.
	if (udf_xdr_ship_op == true) {
		if (UDF_OP_IS_WRITE(*urecord_op)) {
			cf_detail(AS_UDF, "UDF write shipping for key %" PRIx64, *(uint64_t *)&tr->keyd);
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, false, set_id);
		} else if (UDF_OP_IS_DELETE(*urecord_op)) {
			cf_detail(AS_UDF, "UDF delete shipping for key %" PRIx64, *(uint64_t *)&tr->keyd);
			xdr_write(tr->rsv.ns, tr->keyd, generation, 0, true, set_id);
		}
	}

	// Replication happens when the main record replicates
	if (urecord->particle_data) {
		cf_free(urecord->particle_data);
		urecord->particle_data = 0;
	}
	udf_record_cache_free(urecord);
}
/**
 * aerospike::create(record)
 * Function: udf_aerospike_rec_create
 *
 * Parameters:
 * 		as - as_aerospike
 *		rec - as_rec
 *
 * Return Values:
 * 		non-zero result of udf_aerospike_param_check if the parameters are bad
 * 		1 if the udf_record is already open, or a non-expired record already exists
 * 		negative value from as_record_get_create if the record cannot be opened
 * 		4 if the set name or the key from the message cannot be applied
 * 		o/w return value of udf_aerospike__execute_updates
 *
 * Description:
 * 		Create a new record in local storage.
 * 		The record is only created if it does not exist, or exists but is
 * 		expired (in which case it is re-initialized and treated as a create).
 * 		This assumes the record has a digest that is valid for local storage.
 *
 *		Synchronization : object lock acquired by the transaction thread executing UDF.
 * 		Partition reservation takes place just before the transaction starts executing
 * 		( look for as_partition_reserve_udf in thr_tsvc.c )
 *
 * 		Callers:
 * 		lua interfacing function, mod_lua_aerospike_rec_create
 * 		The return value of udf_aerospike_rec_create is pushed on to the lua stack
 *
 * 		Notes:
 * 		On success UDF_RECORD_FLAG_OPEN and UDF_RECORD_FLAG_STORAGE_OPEN are set
 * 		on the udf_record.
*/
static int
udf_aerospike_rec_create(const as_aerospike * as, const as_rec * rec)
{
	int check = udf_aerospike_param_check(as, rec, __FILE__, __LINE__);

	if (check != 0) {
		return check;
	}

	udf_record *urecord = (udf_record *)as_rec_source(rec);

	// Nothing to create if this UDF record is already open.
	if ((urecord->flag & UDF_RECORD_FLAG_OPEN) != 0) {
		cf_detail(AS_UDF, "udf_aerospike_rec_create: Record Already Exists");
		return 1;
	}

	as_transaction *tr = urecord->tr;
	as_index_ref *r_ref = urecord->r_ref;
	as_storage_rd *rd = urecord->rd;

	// Sub-records are kept in the reservation's sub-tree.
	bool is_subrec = (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) != 0;
	as_index_tree *tree = is_subrec ? tr->rsv.sub_tree : tr->rsv.tree;

	int get_rv = as_record_get_create(tree, &tr->keyd, r_ref, tr->rsv.ns, is_subrec);

	cf_detail_digest(AS_UDF, &tr->keyd, "Creating %sRecord",
			is_subrec ? "Sub" : "");

	// get_rv 0 means record exists, 1 means create, < 0 means fail
	// TODO: Verify correct result codes.
	bool is_create = false;

	if (get_rv < 0) {
		cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Open Failed with rv=%d", get_rv);
		return get_rv;
	}

	if (get_rv == 1) {
		is_create = true;
	}
	else if (get_rv == 0) {
		if (! as_record_is_expired(r_ref->r)) {
			cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Already Exists 2");
			as_record_done(r_ref, tr->rsv.ns);
			// DO NOT change it has special meaning for caller
			return 1;
		}

		// If it's an expired record, pretend it's a fresh create.
		as_record_destroy(r_ref->r, tr->rsv.ns);
		as_record_initialize(r_ref, tr->rsv.ns);
		cf_atomic_int_incr(&tr->rsv.ns->n_objects);
		is_create = true;
	}

	// Associate the set name (if the transaction carries one) with the index
	// entry; on failure undo the create and release the record.
	if (tr->msgp && as_transaction_has_set(tr) &&
			as_record_set_set_from_msg(r_ref->r, tr->rsv.ns, &tr->msgp->msg) != 0) {
		cf_warning(AS_UDF, "udf_aerospike_rec_create: Failed to set setname");

		if (is_create) {
			as_index_delete(tree, &tr->keyd);
		}

		as_record_done(r_ref, tr->rsv.ns);
		return 4;
	}

	urecord->flag |= UDF_RECORD_FLAG_OPEN;
	cf_detail(AS_UDF, "Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd);

	as_index *r = r_ref->r;

	// open up storage
	as_storage_record_create(tr->rsv.ns, r, rd, &tr->keyd);

	cf_detail(AS_UDF, "as_storage_record_create: udf_aerospike_rec_create: r %p rd %p",
		r, rd);

	// If the message has a key, apply it to the record.
	if (! get_msg_key(tr, rd)) {
		cf_warning(AS_UDF, "udf_aerospike_rec_create: Can't store key");

		if (is_create) {
			as_index_delete(tree, &tr->keyd);
		}

		// NOTE(review): storage was opened just above via
		// as_storage_record_create(), and this path returns without a storage
		// close call - confirm that is intentional.
		as_record_done(r_ref, tr->rsv.ns);
		urecord->flag &= ~UDF_RECORD_FLAG_OPEN;
		return 4;
	}

	// if multibin storage, we will use urecord->stack_bins, so set the size appropriately
	if (! rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
		rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin);
	}

	// side effect: will set the unused bins to properly unused
	rd->bins = as_bin_get_all(r, rd, urecord->stack_bins);
	urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN;

	cf_detail(AS_UDF, "Storage Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd);
	cf_detail(AS_UDF, "udf_aerospike_rec_create: Record created %d", urecord->flag);

	int update_rc = udf_aerospike__execute_updates(urecord);

	if (update_rc != 0) {
		//  Creating the udf record failed, destroy the as_record
		cf_warning(AS_UDF, "udf_aerospike_rec_create: failure executing record updates (%d)", update_rc);

		if (! as_bin_inuse_has(urecord->rd)) {
			udf_aerospike_rec_remove(as, rec);
		}
	}

	return update_rc;
}
/*
 * Apply a replica write: create or open the record identified by keyd, replace
 * its contents from the pickled buffer, and write it to storage.
 *
 * Parameters (as used below):
 *   rsv         - partition reservation supplying the namespace and the index
 *                 tree / sub-tree.
 *   keyd        - digest of the record.
 *   pickled_buf,
 *   pickled_sz  - pickled record handed to as_record_unpickle_replace(); the
 *                 first two bytes are read here as a network-order bin count.
 *   p_rec_props - passed to as_record_set_properties().
 *   generation,
 *   void_time,
 *   last_update_time - stored on the record after a successful unpickle.
 *   master      - forwarded to xdr_write().
 *   info        - RW_INFO_* flags; LDT_SUBREC, LDT_ESR, LDT_PARENTREC,
 *                 SINDEX_TOUCHED and XDR are tested here.
 *   linfo       - LDT prole info, passed to ldt_get_prole_version() and read
 *                 when the record is an LDT parent.
 *
 * Returns AS_PROTO_RESULT_OK on success,
 * AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE when as_storage_has_space()
 * fails, and AS_PROTO_RESULT_FAIL_UNKNOWN for the other failures handled here.
 */
int
write_replica(as_partition_reservation* rsv, cf_digest* keyd,
		uint8_t* pickled_buf, size_t pickled_sz,
		const as_rec_props* p_rec_props, as_generation generation,
		uint32_t void_time, uint64_t last_update_time, cf_node master,
		uint32_t info, ldt_prole_info* linfo)
{
	as_namespace* ns = rsv->ns;

	if (! as_storage_has_space(rsv->ns)) {
		cf_warning(AS_RW, "{%s} write_replica: drives full", ns->name);
		return AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE;
	}

	as_index_tree* tree = rsv->tree;
	bool is_subrec = false;
	bool is_ldt_parent = false;

	// With LDT enabled, sub-records and ESRs go to the sub-tree; parent
	// records stay in the main tree but are flagged for version handling.
	if (ns->ldt_enabled) {
		if ((info & RW_INFO_LDT_SUBREC) != 0 || (info & RW_INFO_LDT_ESR) != 0) {
			tree = rsv->sub_tree;
			is_subrec = true;
		}
		else if ((info & RW_INFO_LDT_PARENTREC) != 0) {
			is_ldt_parent = true;
		}
	}

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	int rv = as_record_get_create(tree, keyd, &r_ref, ns, is_subrec);

	if (rv < 0) {
		cf_warning_digest(AS_RW, keyd, "{%s} write_replica: fail as_record_get_create() ", ns->name);
		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	as_record* r = r_ref.r;
	as_storage_rd rd;
	bool is_create = false;

	// rv == 1 is handled as a fresh create; any other non-negative value opens
	// the existing record. is_create drives the rollback on failure below.
	if (rv == 1) {
		as_storage_record_create(ns, r, &rd, keyd);
		is_create = true;
	}
	else {
		as_storage_record_open(ns, r, &rd, keyd);
	}

	bool has_sindex = (info & RW_INFO_SINDEX_TOUCHED) != 0;

	// The on-device record is ignored unless a secondary index was touched or
	// this is an LDT parent.
	rd.ignore_record_on_device = ! has_sindex && ! is_ldt_parent;
	rd.n_bins = as_bin_get_n_bins(r, &rd);

	// TODO - we really need an inline utility for this!
	// NOTE(review): pickled_sz is not checked before these two bytes are read -
	// confirm callers guarantee a buffer of at least that size.
	uint16_t newbins = ntohs(*(uint16_t*)pickled_buf);

	// For multi-bin data not in memory, make room for the incoming bin count if
	// it exceeds the current one.
	if (! rd.ns->storage_data_in_memory && ! rd.ns->single_bin &&
			newbins > rd.n_bins) {
		rd.n_bins = newbins;
	}

	// Zero-length when data is in memory.
	as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

	rd.bins = as_bin_get_all(r, &rd, stack_bins);

	uint32_t stack_particles_sz = rd.ns->storage_data_in_memory ?
			0 : as_record_buf_get_stack_particles_sz(pickled_buf);
	uint8_t stack_particles[stack_particles_sz + 256];
	uint8_t* p_stack_particles = stack_particles;
	// + 256 for LDT control bin, to hold version.

	if (! ldt_get_prole_version(rsv, keyd, linfo, info, &rd, is_create)) {
		// Roll back: drop the index entry we just created, then release
		// storage and the record reference.
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	// Memory footprint before the replace (0 for a new record), used for the
	// stats adjustment after the replace.
	uint64_t memory_bytes = 0;

	if (! is_create) {
		memory_bytes = as_storage_record_get_n_bytes_memory(&rd);
	}

	as_record_set_properties(&rd, p_rec_props);

	if (as_record_unpickle_replace(r, &rd, pickled_buf, pickled_sz,
			&p_stack_particles, has_sindex) != 0) {
		// Same rollback as above.
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN; // TODO - better granularity?
	}

	// Adopt the metadata supplied by the caller.
	r->generation = generation;
	r->void_time = void_time;
	r->last_update_time = last_update_time;

	as_storage_record_adjust_mem_stats(&rd, memory_bytes);

	uint64_t version_to_set = 0;
	bool set_version = false;

	// For an LDT parent, pick the version to stamp: the prole version when the
	// partition versions match and a prole version is set, the source version
	// when they do not match. Otherwise no version is set.
	if (is_ldt_parent) {
		if (linfo->replication_partition_version_match &&
				linfo->ldt_prole_version_set) {
			version_to_set = linfo->ldt_prole_version;
			set_version = true;
		}
		else if (! linfo->replication_partition_version_match) {
			version_to_set = linfo->ldt_source_version;
			set_version = true;
		}
	}

	if (set_version) {
		int ldt_rv = as_ldt_parent_storage_set_version(&rd, version_to_set,
				p_stack_particles, __FILE__, __LINE__);

		// A failure here is only logged; the write continues.
		if (ldt_rv < 0) {
			cf_warning(AS_LDT, "write_replica: LDT parent storage version set failed %d", ldt_rv);
			// TODO - roll back.
		}
	}

	bool is_delete = false;

	if (! as_bin_inuse_has(&rd)) {
		// A master write that deletes a record by deleting (all) bins sends a
		// binless pickle that ends up here.
		is_delete = true;
		as_index_delete(tree, keyd);
	}

	// Storage write and close happen on the delete path as well.
	as_storage_record_write(r, &rd);
	as_storage_record_close(r, &rd);

	// Read the set id while the record reference is still held.
	uint16_t set_id = as_index_get_set_id(r);

	as_record_done(&r_ref, ns);

	// Don't send an XDR delete if it's disallowed.
	if (is_delete && ! is_xdr_delete_shipping_enabled()) {
		// TODO - should we also not ship if there was no record here before?
		return AS_PROTO_RESULT_OK;
	}

	// Do XDR write if the write is a non-XDR write or forwarding is enabled.
	if ((info & RW_INFO_XDR) == 0 ||
			is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes) {
		xdr_write(ns, *keyd, generation, master, is_delete, set_id, NULL);
	}

	return AS_PROTO_RESULT_OK;
}