Beispiel #1
0
// TODO - old pickle - remove in "six months".
int
old_record_apply_ssd_single_bin(as_remote_record *rr, as_storage_rd *rd,
		bool *is_delete)
{
	as_namespace* ns = rr->rsv->ns;
	as_record* r = rd->r;

	uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->pickle);

	if (n_new_bins > 1) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, n_new_bins);
		return AS_ERR_UNKNOWN;
	}

	as_bin stack_bin = { { 0 } };

	rd->n_bins = 1;
	rd->bins = &stack_bin;

	// Fill the new bin and particle.
	cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE);

	int result;

	if (n_new_bins == 1 &&
			(result = unpickle_bins(rr, rd, &particles_llb)) != 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bin ", ns->name);
		cf_ll_buf_free(&particles_llb);
		return result;
	}

	// Apply changes to metadata in as_index needed for and writing.
	index_metadata old_metadata;

	update_index_metadata(rr, &old_metadata, r);

	// Prepare to store or drop key, as determined by message.
	rd->key = rr->key;
	rd->key_size = rr->key_size;

	// Write the record to storage.
	if ((result = as_record_write_from_pickle(rd)) < 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
		unwind_index_metadata(&old_metadata, r);
		cf_ll_buf_free(&particles_llb);
		return -result;
	}

	// Now ok to store or drop key, as determined by message.
	as_record_finalize_key(r, ns, rd->key, rd->key_size);

	*is_delete = n_new_bins == 0;

	cf_ll_buf_free(&particles_llb);

	return AS_OK;
}
Beispiel #2
0
int
record_apply_ssd_single_bin(as_remote_record *rr, as_storage_rd *rd,
		bool *is_delete)
{
	// TODO - old pickle - remove in "six months".
	if (rr->is_old_pickle) {
		return old_record_apply_ssd_single_bin(rr, rd, is_delete);
	}

	as_namespace* ns = rr->rsv->ns;
	as_record* r = rd->r;

	uint16_t n_new_bins = rr->n_bins;

	if (n_new_bins > 1) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, n_new_bins);
		return AS_ERR_UNKNOWN;
	}

	// Won't use to flatten, but needed to know if bins are in use.
	rd->n_bins = n_new_bins;

	// Apply changes to metadata in as_index needed for and writing.
	index_metadata old_metadata;

	update_index_metadata(rr, &old_metadata, r);

	// Write the record to storage.
	int result = as_record_write_from_pickle(rd);

	if (result < 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
		unwind_index_metadata(&old_metadata, r);
		return -result;
	}

	// Now ok to store or drop key, as determined by message.
	as_record_finalize_key(r, ns, rr->key, rr->key_size);

	*is_delete = n_new_bins == 0;

	return AS_OK;
}
Beispiel #3
0
int
handle_msg_key(as_transaction* tr, as_storage_rd* rd)
{
	// Shortcut pointers.
	as_msg* m = &tr->msgp->msg;
	as_namespace* ns = tr->rsv.ns;

	if (rd->r->key_stored == 1) {
		// Key stored for this record - be sure it gets rewritten.

		// This will force a device read for non-data-in-memory, even if
		// must_fetch_data is false! Since there's no advantage to using the
		// loaded block after this if must_fetch_data is false, leave the
		// subsequent code as-is.
		if (! as_storage_record_get_key(rd)) {
			cf_warning_digest(AS_RW, &tr->keyd, "{%s} can't get stored key ",
					ns->name);
			return AS_ERR_UNKNOWN;
		}

		// Check the client-sent key, if any, against the stored key.
		if (as_transaction_has_key(tr) && ! check_msg_key(m, rd)) {
			cf_warning_digest(AS_RW, &tr->keyd, "{%s} key mismatch ", ns->name);
			return AS_ERR_KEY_MISMATCH;
		}
	}
	// If we got a key without a digest, it's an old client, not a cue to store
	// the key. (Remove this check when we're sure all old C clients are gone.)
	else if (as_transaction_has_digest(tr)) {
		// Key not stored for this record - store one if sent from client. For
		// data-in-memory, don't allocate the key until we reach the point of no
		// return. Also don't set AS_INDEX_FLAG_KEY_STORED flag until then.
		if (! get_msg_key(tr, rd)) {
			return AS_ERR_UNSUPPORTED_FEATURE;
		}
	}

	return 0;
}
Beispiel #4
0
/**
 * Send failure notification of general UDF execution, but check for special
 * LDT errors and return specific Wire Protocol error codes for these cases:
 * (1) Record not found (2)
 * (2) LDT Collection item not found (125)
 *
 * All other errors get the generic 100 (UDF FAIL) code.
 */
static inline int
process_udf_failure(udf_call *call, const as_string *s, cf_dyn_buf *db)
{
	char *val = as_string_tostring(s);
	size_t vlen = as_string_len((as_string *)s); // TODO - make as_string_len() take const
	long error_code = ldt_get_error_code(val, vlen);

	if (error_code) {

		if (error_code == AS_PROTO_RESULT_FAIL_NOTFOUND ||
			error_code == AS_PROTO_RESULT_FAIL_COLLECTION_ITEM_NOT_FOUND) {

			call->tr->result_code = (uint8_t)error_code;
			// Send an "empty" response, with no failure bin.
			as_transaction *    tr          = call->tr;

			if (db) {
				size_t msg_sz = 0;
				uint8_t *msgp = (uint8_t *)as_msg_make_response_msg(
						tr->result_code, 0, 0, NULL, NULL, 0, tr->rsv.ns, NULL,
						&msg_sz, as_transaction_trid(tr), NULL);

				if (! msgp)	{
					cf_warning_digest(AS_RW, &tr->keyd, "{%s} LDT UDF failed to make response msg ", tr->rsv.ns->name);
					return -1;
				}

				// Stash the message, to be sent later.
				db->buf = msgp;
				db->is_stack = false;
				db->alloc_sz = msg_sz;
				db->used_sz = msg_sz;
			}
			else {
				single_transaction_response(tr, tr->rsv.ns, NULL/*ops*/,
						NULL /*bin*/, 0 /*nbins*/, 0, 0, NULL, NULL);
			}
			return 0;
		}
	}

	cf_debug(AS_UDF, "Non-special LDT or General UDF Error(%s)", (char *) val);

	call->tr->result_code = AS_PROTO_RESULT_FAIL_UDF_EXECUTION;
	return process_failure(call, as_string_toval(s), db);
}
as_rec *
crec_create(ldt_record *lrecord)
{
	// Generate Key Digest
	udf_record *h_urecord = (udf_record *) as_rec_source(lrecord->h_urec);
	cf_digest keyd        = h_urecord->r_ref->r->key;
	as_namespace *ns      = h_urecord->tr->rsv.ns;
	int retry_cnt         = 0;
	ldt_slot *lslotp      = slot_lookup_free(lrecord, "crec_create");

	if (!lslotp) {
		cf_crash(AS_LDT, "Allocation error !!!");
	}
	slot_init(lslotp, lrecord);

	while (retry_cnt++ < LDT_SUBRECORD_RANDOMIZER_MAX_RETRIES) {

		as_ldt_digest_randomizer(&keyd);
		as_ldt_subdigest_setversion(&keyd, lrecord->version);
		slot_setup_digest(lslotp, &keyd);

		int rv = as_aerospike_rec_create(lrecord->as, lslotp->c_urec_p);

		// rv == 0 if successful
		// rv == 1 if record is already found retry
		// other wise failure
		if (rv == 0) {
			cf_detail_digest(AS_LDT, &keyd, "Crec Create:Ptr(%p) Digest: version %ld", lslotp->c_urec_p, lrecord->version);
			as_val_reserve(lslotp->c_urec_p);
			return lslotp->c_urec_p;
		}

		if (rv != 1) {
			cf_warning(AS_LDT, "crec_create: LDT Sub-Record Create Error [rv=%d]... Fail", rv);
			break;
		}
		cf_atomic64_incr(&ns->lstats.ldt_randomizer_retry);
	}

	slot_destroy(lslotp, lrecord);
	cf_warning_digest(AS_LDT, &keyd, "ldt_aerospike_crec_create : Create failed after %d retries", retry_cnt);
	return NULL;
}
Beispiel #6
0
transaction_status
read_local(as_transaction* tr)
{
	as_msg* m = &tr->msgp->msg;
	as_namespace* ns = tr->rsv.ns;

	as_index_ref r_ref;

	if (as_record_get(tr->rsv.tree, &tr->keyd, &r_ref) != 0) {
		read_local_done(tr, NULL, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	as_record* r = r_ref.r;

	// Check if it's an expired or truncated record.
	if (as_record_is_doomed(r, ns)) {
		read_local_done(tr, &r_ref, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	int result = repl_state_check(r, tr);

	if (result != 0) {
		if (result == -3) {
			read_local_done(tr, &r_ref, NULL, AS_ERR_UNAVAILABLE);
			return TRANS_DONE_ERROR;
		}

		// No response sent to origin.
		as_record_done(&r_ref, ns);
		return result == 1 ? TRANS_IN_PROGRESS : TRANS_WAITING;
	}

	// Check if it's a tombstone.
	if (! as_record_is_live(r)) {
		read_local_done(tr, &r_ref, NULL, AS_ERR_NOT_FOUND);
		return TRANS_DONE_ERROR;
	}

	as_storage_rd rd;

	as_storage_record_open(ns, r, &rd);

	// If configuration permits, allow reads to use page cache.
	rd.read_page_cache = ns->storage_read_page_cache;

	// Check the key if required.
	// Note - for data-not-in-memory "exists" ops, key check is expensive!
	if (as_transaction_has_key(tr) &&
			as_storage_record_get_key(&rd) && ! check_msg_key(m, &rd)) {
		read_local_done(tr, &r_ref, &rd, AS_ERR_KEY_MISMATCH);
		return TRANS_DONE_ERROR;
	}

	if ((m->info1 & AS_MSG_INFO1_GET_NO_BINS) != 0) {
		tr->generation = r->generation;
		tr->void_time = r->void_time;
		tr->last_update_time = r->last_update_time;

		read_local_done(tr, &r_ref, &rd, AS_OK);
		return TRANS_DONE_SUCCESS;
	}

	if ((result = as_storage_rd_load_n_bins(&rd)) < 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_n_bins() ", ns->name);
		read_local_done(tr, &r_ref, &rd, -result);
		return TRANS_DONE_ERROR;
	}

	as_bin stack_bins[ns->storage_data_in_memory ? 0 : rd.n_bins];

	if ((result = as_storage_rd_load_bins(&rd, stack_bins)) < 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_bins() ", ns->name);
		read_local_done(tr, &r_ref, &rd, -result);
		return TRANS_DONE_ERROR;
	}

	if (! as_bin_inuse_has(&rd)) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: found record with no bins ", ns->name);
		read_local_done(tr, &r_ref, &rd, AS_ERR_UNKNOWN);
		return TRANS_DONE_ERROR;
	}

	uint32_t bin_count = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0 ?
			rd.n_bins : m->n_ops;

	as_msg_op* ops[bin_count];
	as_msg_op** p_ops = ops;
	as_bin* response_bins[bin_count];
	uint16_t n_bins = 0;

	as_bin result_bins[bin_count];
	uint32_t n_result_bins = 0;

	if ((m->info1 & AS_MSG_INFO1_GET_ALL) != 0) {
		p_ops = NULL;
		n_bins = rd.n_bins;
		as_bin_get_all_p(&rd, response_bins);
	}
	else {
		if (m->n_ops == 0) {
			cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: bin op(s) expected, none present ", ns->name);
			read_local_done(tr, &r_ref, &rd, AS_ERR_PARAMETER);
			return TRANS_DONE_ERROR;
		}

		bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0;

		as_msg_op* op = 0;
		int n = 0;

		while ((op = as_msg_op_iterate(m, op, &n)) != NULL) {
			if (op->op == AS_MSG_OP_READ) {
				as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);

				if (b || respond_all_ops) {
					ops[n_bins] = op;
					response_bins[n_bins++] = b;
				}
			}
			else if (op->op == AS_MSG_OP_CDT_READ) {
				as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);

				if (b) {
					as_bin* rb = &result_bins[n_result_bins];
					as_bin_set_empty(rb);

					if ((result = as_bin_cdt_read_from_client(b, op, rb)) < 0) {
						cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_bin_cdt_read_from_client() ", ns->name);
						destroy_stack_bins(result_bins, n_result_bins);
						read_local_done(tr, &r_ref, &rd, -result);
						return TRANS_DONE_ERROR;
					}

					if (as_bin_inuse(rb)) {
						n_result_bins++;
						ops[n_bins] = op;
						response_bins[n_bins++] = rb;
					}
					else if (respond_all_ops) {
						ops[n_bins] = op;
						response_bins[n_bins++] = NULL;
					}
				}
				else if (respond_all_ops) {
					ops[n_bins] = op;
					response_bins[n_bins++] = NULL;
				}
			}
			else {
				cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: unexpected bin op %u ", ns->name, op->op);
				destroy_stack_bins(result_bins, n_result_bins);
				read_local_done(tr, &r_ref, &rd, AS_ERR_PARAMETER);
				return TRANS_DONE_ERROR;
			}
		}
	}

	cf_dyn_buf_define_size(db, 16 * 1024);

	if (tr->origin != FROM_BATCH) {
		db.used_sz = db.alloc_sz;
		db.buf = (uint8_t*)as_msg_make_response_msg(tr->result_code,
				r->generation, r->void_time, p_ops, response_bins, n_bins, ns,
				(cl_msg*)dyn_bufdb, &db.used_sz, as_transaction_trid(tr));

		db.is_stack = db.buf == dyn_bufdb;
		// Note - not bothering to correct alloc_sz if buf was allocated.
	}
	else {
		tr->generation = r->generation;
		tr->void_time = r->void_time;
		tr->last_update_time = r->last_update_time;

		// Since as_batch_add_result() constructs response directly in shared
		// buffer to avoid extra copies, can't use db.
		send_read_response(tr, p_ops, response_bins, n_bins, NULL);
	}

	destroy_stack_bins(result_bins, n_result_bins);
	as_storage_record_close(&rd);
	as_record_done(&r_ref, ns);

	// Now that we're not under the record lock, send the message we just built.
	if (db.used_sz != 0) {
		send_read_response(tr, NULL, NULL, 0, &db);

		cf_dyn_buf_free(&db);
		tr->from.proto_fd_h = NULL;
	}

	return TRANS_DONE_SUCCESS;
}
/*
 * The work horse function to process the acknowledgment for the duplicate op.
 * It is received after the intended node has finished performing the op. In
 * case of success the op would have been successfully performed and replicated.
 * In case of failure the op would not have been performed anywhere.
 *
 * The retransmit is handled by making sure op hangs from the write hash as long
 * as it is not applied or failed. Any attempt to perform next operation has to
 * hang behind it unless it is finished. Also operation is assigned a timestamp
 * so that there is some protection in case the op arrives out of order, or the
 * same op comes back again. That would be a duplicate op ...
 *
 * Received a op message - I'm a winner duplicate on this partition. Perform the
 * UDF op and replicate to all the nodes in the replica list. We only replicate
 * the subrecord if the partition is in subrecord migration phase. If not, ship
 * both subrecord and record. In case partition is read replica on this node, do
 * the write and signal back that I'm done.
 *
 * THUS - PROLE SIDE
 *
 * is_write is misnamed. Differentiates between the 'duplicate' phase and the
 *    'operation' phase. If is_write == false, we're in the 'duplicate' phase.
 *
 * Algorithm
 *
 * This code is called when op is shipped to the winner node.
 *
 * 1. Assert that current node is indeed the winner node.
 * 2. Assert the cluster key matches.
 * 3. Create a transaction and apply the UDF. Create an internal transaction and
 *    make sure it does some sort of reservation and applies the write and
 *    replicates to replica set. Once the write is done it sends the op ack.
 *
 *    TODO: How do you handle retransmits?
 *    TODO: How do you handle partition reservation? Is it something special.
 *    TODO: How to send along with replication request? Same infra should be
 *          used by normal replication as well.
 *
 *    There won't be any deadlock because the requests are triggered from the
 *    write. Get down to the udf apply code. Replicate to replica set and then
 *    make sure the response is sent back to the originating node. This node has
 *    to make sure the replication actually did succeed.
 *
 * In the response code you need to add the callback function.
 */
int
as_proxy_shipop_response_hdlr(msg *m, proxy_request *pr, bool *free_msg)
{
	int rv            = -1;
	write_request *wr = pr->wr;
	if (!wr) {
		return -1;
	}
	cf_assert((pr->fd_h == NULL), AS_PROXY, CF_WARNING, "fd_h set for shipop proxy response");

	// If there is a write request hanging from pr then this is a response to
	// the proxy ship op request. This node is the resolving node (node @ which
	// duplicate resolution was triggered). It could be:
	// 1. Originating node [where the request was sent from the client] - in
	//    that case send response back to the client directly.
	// 2. Non-originating node [where the request arrived as a regular proxy] -
	//    in that case send response back to the proxy originating node.

	// Case 1: Non-originating node.
	if (wr->proxy_msg) {
		// Remember that "digest" gets printed at the end of cf_detail_digest().
		// Fake the ORIGINATING Proxy tid
		uint32_t transaction_id = 0;
		msg_get_uint32(wr->proxy_msg, PROXY_FIELD_TID, &transaction_id);
		msg_set_uint32(m, PROXY_FIELD_TID, transaction_id);
		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Got Op Response(%p) :", wr);
		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Back Forwarding Response for tid (%d). : ", transaction_id);
		if (0 != (rv = as_fabric_send(wr->proxy_node, m, AS_FABRIC_PRIORITY_MEDIUM))) {
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP NONORIG Failed Forwarding Response");
			as_fabric_msg_put(m);
		}
		*free_msg = false;
	}
	// Case 2: Originating node.
	else {
		cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Got Op Response");
		pthread_mutex_lock(&wr->lock);
		if (wr->proto_fd_h) {
			if (!wr->proto_fd_h->fd) {
				cf_warning_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing fd in proto_fd ");
			}
			else {
				as_proto *proto;
				size_t proto_sz;
				if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
					cf_info(AS_PROXY, "msg get buf failed!");
				}
				size_t pos = 0;
				while (pos < proto_sz) {
					rv = send(wr->proto_fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);
					if (rv > 0) {
						pos += rv;
					}
					else if (rv < 0) {
						if (errno != EWOULDBLOCK) {
							// Common message when a client aborts.
							cf_debug(AS_PROTO, "protocol proxy write fail: fd %d "
									"sz %d pos %d rv %d errno %d",
									wr->proto_fd_h->fd, proto_sz, pos, rv, errno);
							shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
							break;
						}
						usleep(1); // yield
					}
					else {
						cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ",
								wr->proto_fd_h->fd, proto_sz, pos);
						shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
						break;
					}
				}
				cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Response Sent to Client");
			}
			wr->proto_fd_h->t_inprogress = false;
			AS_RELEASE_FILE_HANDLE(wr->proto_fd_h);
			wr->proto_fd_h = 0;
		} else {
			// this may be NULL if the request has already timedout and the wr proto_fd_h
			// will be cleaned up by then
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing proto_fd ");

			// Note: This may be needed if this is node where internal scan or query
			// UDF is initiated where it happens so that there is migration is going
			// on and the request get routed to the remote node which is winning node
			// This request may need the req_cb to be called.
			if (udf_rw_needcomplete_wr(wr)) {
				as_transaction tr;
				write_request_init_tr(&tr, wr);
				udf_rw_complete(&tr, 0, __FILE__, __LINE__);
				if (tr.proto_fd_h) {
					tr.proto_fd_h->t_inprogress = false;
					AS_RELEASE_FILE_HANDLE(tr.proto_fd_h);
					tr.proto_fd_h = 0;
				}
			}
		}
		pthread_mutex_unlock(&wr->lock);
	}

	// This node is shipOp initiator. Remove it from the Global
	// hash
	global_keyd gk;
	gk.ns_id = wr->rsv.ns->id;
	gk.keyd = wr->keyd;
	g_write_hash_delete(&gk);

	WR_RELEASE(pr->wr);
	pr->wr = NULL;
	return 0;
}
Beispiel #8
0
// TODO - old pickle - remove in "six months".
int
old_record_apply_dim(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex,
		bool *is_delete)
{
	as_namespace* ns = rr->rsv->ns;
	as_record* r = rd->r;

	// Set rd->n_bins!
	as_storage_rd_load_n_bins(rd);

	// Set rd->bins!
	as_storage_rd_load_bins(rd, NULL);

	// For memory accounting, note current usage.
	uint64_t memory_bytes = as_storage_record_get_n_bytes_memory(rd);

	// Keep old bins intact for sindex adjustment and unwinding.
	uint16_t n_old_bins = rd->n_bins;
	as_bin* old_bins = rd->bins;

	uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->pickle);
	as_bin new_bins[n_new_bins];

	memset(new_bins, 0, sizeof(new_bins));
	rd->n_bins = n_new_bins;
	rd->bins = new_bins;

	// Fill the new bins and particles.
	int result = unpickle_bins(rr, rd, NULL);

	if (result != 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bins ", ns->name);
		destroy_stack_bins(new_bins, n_new_bins);
		return result;
	}

	// Apply changes to metadata in as_index needed for and writing.
	index_metadata old_metadata;

	update_index_metadata(rr, &old_metadata, r);

	// Prepare to store or drop key, as determined by message.
	rd->key = rr->key;
	rd->key_size = rr->key_size;

	// Write the record to storage.
	if ((result = as_record_write_from_pickle(rd)) < 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
		unwind_index_metadata(&old_metadata, r);
		destroy_stack_bins(new_bins, n_new_bins);
		return -result;
	}

	// Success - adjust sindex, looking at old and new bins.
	if (! (skip_sindex &&
			next_generation(r->generation, (uint16_t)rr->generation, ns)) &&
					record_has_sindex(r, ns)) {
		write_sindex_update(ns, as_index_get_set_name(r, ns), rr->keyd,
				old_bins, n_old_bins, new_bins, n_new_bins);
	}

	// Cleanup - destroy relevant bins, can't unwind after.
	destroy_stack_bins(old_bins, n_old_bins);

	// Fill out new_bin_space.
	as_bin_space* new_bin_space = NULL;

	if (n_new_bins != 0) {
		new_bin_space = (as_bin_space*)
				cf_malloc_ns(sizeof(as_bin_space) + sizeof(new_bins));

		new_bin_space->n_bins = rd->n_bins;
		memcpy((void*)new_bin_space->bins, new_bins, sizeof(new_bins));
	}

	// Swizzle the index element's as_bin_space pointer.
	as_bin_space* old_bin_space = as_index_get_bin_space(r);

	if (old_bin_space) {
		cf_free(old_bin_space);
	}

	as_index_set_bin_space(r, new_bin_space);

	// Now ok to store or drop key, as determined by message.
	as_record_finalize_key(r, ns, rd->key, rd->key_size);

	as_storage_record_adjust_mem_stats(rd, memory_bytes);
	*is_delete = n_new_bins == 0;

	return AS_OK;
}
Beispiel #9
0
// TODO - old pickle - remove in "six months".
int
old_record_apply_dim_single_bin(as_remote_record *rr, as_storage_rd *rd,
		bool *is_delete)
{
	as_namespace* ns = rr->rsv->ns;
	as_record* r = rd->r;

	rd->n_bins = 1;

	// Set rd->bins!
	as_storage_rd_load_bins(rd, NULL);

	// For memory accounting, note current usage.
	uint64_t memory_bytes = 0;

	// TODO - as_storage_record_get_n_bytes_memory() could check bins in use.
	if (as_bin_inuse(rd->bins)) {
		memory_bytes = as_storage_record_get_n_bytes_memory(rd);
	}

	uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->pickle);

	if (n_new_bins > 1) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, n_new_bins);
		return AS_ERR_UNKNOWN;
	}

	// Keep old bin intact for unwinding, clear record bin for incoming.
	as_bin old_bin;

	as_single_bin_copy(&old_bin, rd->bins);
	as_bin_set_empty(rd->bins);

	int result;

	// Fill the new bins and particles.
	if (n_new_bins == 1 &&
			(result = unpickle_bins(rr, rd, NULL)) != 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bin ", ns->name);
		unwind_dim_single_bin(&old_bin, rd->bins);
		return result;
	}

	// Apply changes to metadata in as_index needed for and writing.
	index_metadata old_metadata;

	update_index_metadata(rr, &old_metadata, r);

	// Write the record to storage.
	if ((result = as_record_write_from_pickle(rd)) < 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
		unwind_index_metadata(&old_metadata, r);
		unwind_dim_single_bin(&old_bin, rd->bins);
		return -result;
	}

	// Cleanup - destroy old bin, can't unwind after.
	as_bin_particle_destroy(&old_bin, true);

	as_storage_record_adjust_mem_stats(rd, memory_bytes);
	*is_delete = n_new_bins == 0;

	return AS_OK;
}
Beispiel #10
0
int
record_apply_ssd(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex,
		bool *is_delete)
{
	// TODO - old pickle - remove in "six months".
	if (rr->is_old_pickle) {
		return old_record_apply_ssd(rr, rd, skip_sindex, is_delete);
	}

	as_namespace* ns = rr->rsv->ns;
	as_record* r = rd->r;

	bool has_sindex = ! (skip_sindex &&
			next_generation(r->generation, (uint16_t)rr->generation, ns)) &&
					record_has_sindex(r, ns);

	int result;

	uint16_t n_old_bins = 0;
	as_bin *old_bins = NULL;

	uint16_t n_new_bins = rr->n_bins;
	as_bin *new_bins = NULL;

	if (has_sindex) {
		// TODO - separate function?
		if ((result = as_storage_rd_load_n_bins(rd)) < 0) {
			cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load n-bins ", ns->name);
			return -result;
		}

		n_old_bins = rd->n_bins;
		old_bins = alloca(n_old_bins * sizeof(as_bin));

		if ((result = as_storage_rd_load_bins(rd, old_bins)) < 0) {
			cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load bins ", ns->name);
			return -result;
		}

		// Won't use to flatten.
		rd->bins = NULL;

		if (n_new_bins != 0) {
			new_bins = alloca(n_new_bins * sizeof(as_bin));
			memset(new_bins, 0, n_new_bins * sizeof(as_bin));

			if ((result = as_flat_unpack_remote_bins(rr, new_bins)) != 0) {
				cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bins ", ns->name);
				return -result;
			}
		}
	}

	// Won't use to flatten, but needed to know if bins are in use.
	rd->n_bins = n_new_bins;

	// Apply changes to metadata in as_index needed for and writing.
	index_metadata old_metadata;

	update_index_metadata(rr, &old_metadata, r);

	// Write the record to storage.
	if ((result = as_record_write_from_pickle(rd)) < 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
		unwind_index_metadata(&old_metadata, r);
		return -result;
	}

	// Success - adjust sindex, looking at old and new bins.
	if (has_sindex) {
		write_sindex_update(ns, as_index_get_set_name(r, ns), rr->keyd,
				old_bins, n_old_bins, new_bins, n_new_bins);
	}

	// Now ok to store or drop key, as determined by message.
	as_record_finalize_key(r, ns, rr->key, rr->key_size);

	*is_delete = n_new_bins == 0;

	return AS_OK;
}
Beispiel #11
0
int
record_apply_dim_single_bin(as_remote_record *rr, as_storage_rd *rd,
		bool *is_delete)
{
	// TODO - old pickle - remove in "six months".
	if (rr->is_old_pickle) {
		return old_record_apply_dim_single_bin(rr, rd, is_delete);
	}

	as_namespace* ns = rr->rsv->ns;
	as_record* r = rd->r;

	rd->n_bins = 1;

	// Set rd->bins!
	as_storage_rd_load_bins(rd, NULL);

	// For memory accounting, note current usage.
	uint64_t memory_bytes = 0;

	// TODO - as_storage_record_get_n_bytes_memory() could check bins in use.
	if (as_bin_inuse(rd->bins)) {
		memory_bytes = as_storage_record_get_n_bytes_memory(rd);
	}

	uint16_t n_new_bins = rr->n_bins;

	if (n_new_bins > 1) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, n_new_bins);
		return AS_ERR_UNKNOWN;
	}

	// Keep old bin for unwinding.
	as_bin old_bin;

	as_single_bin_copy(&old_bin, rd->bins);

	// No stack new bin - simpler to operate directly on bin embedded in index.
	as_bin_set_empty(rd->bins);

	int result;

	// Fill the new bins and particles.
	if (n_new_bins == 1 &&
			(result = as_flat_unpack_remote_bins(rr, rd->bins)) != 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bin ", ns->name);
		unwind_dim_single_bin(&old_bin, rd->bins);
		return -result;
	}

	// Won't use to flatten, but needed to know if bins are in use. Amazingly,
	// rd->n_bins 0 ok adjusting memory stats. Also, rd->bins already filled.
	rd->n_bins = n_new_bins;

	// Apply changes to metadata in as_index needed for and writing.
	index_metadata old_metadata;

	update_index_metadata(rr, &old_metadata, r);

	// Write the record to storage.
	if ((result = as_record_write_from_pickle(rd)) < 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
		unwind_index_metadata(&old_metadata, r);
		unwind_dim_single_bin(&old_bin, rd->bins);
		return -result;
	}

	// Cleanup - destroy old bin, can't unwind after.
	as_bin_particle_destroy(&old_bin, true);

	as_storage_record_adjust_mem_stats(rd, memory_bytes);
	*is_delete = n_new_bins == 0;

	return AS_OK;
}
Beispiel #12
0
// TODO - old pickle - remove in "six months".
int
old_record_apply_ssd(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex,
		bool *is_delete)
{
	as_namespace* ns = rr->rsv->ns;
	as_record* r = rd->r;
	bool has_sindex = ! (skip_sindex &&
			next_generation(r->generation, (uint16_t)rr->generation, ns)) &&
					record_has_sindex(r, ns);

	uint16_t n_old_bins = 0;
	int result;

	if (has_sindex) {
		// Set rd->n_bins!
		if ((result = as_storage_rd_load_n_bins(rd)) < 0) {
			cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load n-bins ", ns->name);
			return -result;
		}

		n_old_bins = rd->n_bins;
	}

	as_bin old_bins[n_old_bins];

	if (has_sindex) {
		// Set rd->bins!
		if ((result = as_storage_rd_load_bins(rd, old_bins)) < 0) {
			cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load bins ", ns->name);
			return -result;
		}
	}

	// Stack space for resulting record's bins.
	uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->pickle);
	as_bin new_bins[n_new_bins];

	memset(new_bins, 0, sizeof(new_bins));
	rd->n_bins = n_new_bins;
	rd->bins = new_bins;

	// Fill the new bins and particles.
	cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE);

	if ((result = unpickle_bins(rr, rd, &particles_llb)) != 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bins ", ns->name);
		cf_ll_buf_free(&particles_llb);
		return result;
	}

	// Apply changes to metadata in as_index needed for and writing.
	index_metadata old_metadata;

	update_index_metadata(rr, &old_metadata, r);

	// Prepare to store or drop key, as determined by message.
	rd->key = rr->key;
	rd->key_size = rr->key_size;

	// Write the record to storage.
	if ((result = as_record_write_from_pickle(rd)) < 0) {
		cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name);
		unwind_index_metadata(&old_metadata, r);
		cf_ll_buf_free(&particles_llb);
		return -result;
	}

	// Success - adjust sindex, looking at old and new bins.
	if (has_sindex) {
		write_sindex_update(ns, as_index_get_set_name(r, ns), rr->keyd,
				old_bins, n_old_bins, new_bins, n_new_bins);
	}

	// Now ok to store or drop key, as determined by message.
	as_record_finalize_key(r, ns, rd->key, rd->key_size);

	*is_delete = n_new_bins == 0;

	cf_ll_buf_free(&particles_llb);

	return AS_OK;
}
int
write_replica(as_partition_reservation* rsv, cf_digest* keyd,
		uint8_t* pickled_buf, size_t pickled_sz,
		const as_rec_props* p_rec_props, as_generation generation,
		uint32_t void_time, uint64_t last_update_time, cf_node master,
		uint32_t info, ldt_prole_info* linfo)
{
	as_namespace* ns = rsv->ns;

	if (! as_storage_has_space(rsv->ns)) {
		cf_warning(AS_RW, "{%s} write_replica: drives full", ns->name);
		return AS_PROTO_RESULT_FAIL_PARTITION_OUT_OF_SPACE;
	}

	as_index_tree* tree = rsv->tree;
	bool is_subrec = false;
	bool is_ldt_parent = false;

	if (ns->ldt_enabled) {
		if ((info & RW_INFO_LDT_SUBREC) != 0 || (info & RW_INFO_LDT_ESR) != 0) {
			tree = rsv->sub_tree;
			is_subrec = true;
		}
		else if ((info & RW_INFO_LDT_PARENTREC) != 0) {
			is_ldt_parent = true;
		}
	}

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	int rv = as_record_get_create(tree, keyd, &r_ref, ns, is_subrec);

	if (rv < 0) {
		cf_warning_digest(AS_RW, keyd, "{%s} write_replica: fail as_record_get_create() ", ns->name);
		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	as_record* r = r_ref.r;
	as_storage_rd rd;
	bool is_create = false;

	if (rv == 1) {
		as_storage_record_create(ns, r, &rd, keyd);
		is_create = true;
	}
	else {
		as_storage_record_open(ns, r, &rd, keyd);
	}

	bool has_sindex = (info & RW_INFO_SINDEX_TOUCHED) != 0;

	rd.ignore_record_on_device = ! has_sindex && ! is_ldt_parent;
	rd.n_bins = as_bin_get_n_bins(r, &rd);

	// TODO - we really need an inline utility for this!
	uint16_t newbins = ntohs(*(uint16_t*)pickled_buf);

	if (! rd.ns->storage_data_in_memory && ! rd.ns->single_bin &&
			newbins > rd.n_bins) {
		rd.n_bins = newbins;
	}

	as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

	rd.bins = as_bin_get_all(r, &rd, stack_bins);

	uint32_t stack_particles_sz = rd.ns->storage_data_in_memory ?
			0 : as_record_buf_get_stack_particles_sz(pickled_buf);
	uint8_t stack_particles[stack_particles_sz + 256];
	uint8_t* p_stack_particles = stack_particles;
	// + 256 for LDT control bin, to hold version.

	if (! ldt_get_prole_version(rsv, keyd, linfo, info, &rd, is_create)) {
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	uint64_t memory_bytes = 0;

	if (! is_create) {
		memory_bytes = as_storage_record_get_n_bytes_memory(&rd);
	}

	as_record_set_properties(&rd, p_rec_props);

	if (as_record_unpickle_replace(r, &rd, pickled_buf, pickled_sz,
			&p_stack_particles, has_sindex) != 0) {
		if (is_create) {
			as_index_delete(tree, keyd);
		}

		as_storage_record_close(r, &rd);
		as_record_done(&r_ref, ns);

		return AS_PROTO_RESULT_FAIL_UNKNOWN; // TODO - better granularity?
	}

	r->generation = generation;
	r->void_time = void_time;
	r->last_update_time = last_update_time;

	as_storage_record_adjust_mem_stats(&rd, memory_bytes);

	uint64_t version_to_set = 0;
	bool set_version = false;

	if (is_ldt_parent) {
		if (linfo->replication_partition_version_match &&
				linfo->ldt_prole_version_set) {
			version_to_set = linfo->ldt_prole_version;
			set_version = true;
		}
		else if (! linfo->replication_partition_version_match) {
			version_to_set = linfo->ldt_source_version;
			set_version = true;
		}
	}

	if (set_version) {
		int ldt_rv = as_ldt_parent_storage_set_version(&rd, version_to_set,
				p_stack_particles, __FILE__, __LINE__);

		if (ldt_rv < 0) {
			cf_warning(AS_LDT, "write_replica: LDT parent storage version set failed %d", ldt_rv);
			// TODO - roll back.
		}
	}

	bool is_delete = false;

	if (! as_bin_inuse_has(&rd)) {
		// A master write that deletes a record by deleting (all) bins sends a
		// binless pickle that ends up here.
		is_delete = true;
		as_index_delete(tree, keyd);
	}

	as_storage_record_write(r, &rd);
	as_storage_record_close(r, &rd);

	uint16_t set_id = as_index_get_set_id(r);

	as_record_done(&r_ref, ns);

	// Don't send an XDR delete if it's disallowed.
	if (is_delete && ! is_xdr_delete_shipping_enabled()) {
		// TODO - should we also not ship if there was no record here before?
		return AS_PROTO_RESULT_OK;
	}

	// Do XDR write if the write is a non-XDR write or forwarding is enabled.
	if ((info & RW_INFO_XDR) == 0 ||
			is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes) {
		xdr_write(ns, *keyd, generation, master, is_delete, set_id, NULL);
	}

	return AS_PROTO_RESULT_OK;
}
Beispiel #14
0
/* Workhorse function to send response back to the client after UDF execution.
 *
 * Assumption: The call should be setup properly pointing to the tr.
 *
 * Special Handling: If it is background udf job do not send any
 * 					 response to client
 */
int
process_response(udf_call *call, const char *bin_name, const as_val *val, cf_dyn_buf *db)
{
	// NO response if background UDF
	if (call->def->type == AS_UDF_OP_BACKGROUND) {
		return 0;
	}
	// Note - this function quietly handles a null val. The response call will
	// be given a bin with a name but not 'in use', and it does the right thing.

	as_bin stack_bin;
	as_bin *bin = &stack_bin;

	uint32_t particle_size = as_particle_size_from_asval(val);

	static const size_t MAX_STACK_SIZE = 32 * 1024;
	uint8_t stack_particle[particle_size > MAX_STACK_SIZE ? 0 : particle_size];
	uint8_t *particle_buf = stack_particle;

	if (particle_size > MAX_STACK_SIZE) {
		particle_buf = (uint8_t *)cf_malloc(particle_size);

		if (! particle_buf) {
			cf_warning(AS_UDF, "failed alloc for particle size %u", particle_size);
			return -1;
		}
	}

	as_transaction *tr = call->tr;
	as_namespace *ns = tr->rsv.ns;

	as_bin_init(ns, bin, bin_name);
	as_bin_particle_stack_from_asval(bin, particle_buf, val);

	if (db) {
		size_t msg_sz = 0;
		uint8_t *msgp = (uint8_t *)as_msg_make_response_msg(tr->result_code,
				tr->generation, tr->void_time, NULL, &bin, 1, ns, NULL, &msg_sz,
				as_transaction_trid(tr), NULL);

		if (! msgp)	{
			cf_warning_digest(AS_RW, &tr->keyd, "{%s} UDF failed to make response msg ", ns->name);

			if (particle_buf != stack_particle) {
				cf_free(particle_buf);
			}

			return -1;
		}

		// Stash the message, to be sent later.
		db->buf = msgp;
		db->is_stack = false;
		db->alloc_sz = msg_sz;
		db->used_sz = msg_sz;
	}
	else {
		single_transaction_response(tr, ns, NULL, &bin, 1, tr->generation, tr->void_time, NULL, NULL);
	}

	if (particle_buf != stack_particle) {
		cf_free(particle_buf);
	}

	return 0;
}