Ejemplo n.º 1
0
/*
 * Main routine to replicate the chunks of LDT objects. The LDT directory rec
 * is not replicated using this function. This function is called for each chunk
 * that got updated as part of a single LDT operation. Note that in a single
 * LDT operation only a few chunks can change, i.e. the chunks along one
 * path of the tree structure.
 *
 * Assumptions:
 * 1. All records should have been closed.
 * 2. The pickled buf for every record and subrecord that needs shipping should
 *    have been filled.
 *
 * Function:
 *
 * 1. Walk through each sub record and use its pickled buf to create
 *    RW_OP_WRITE. Pack it in the buffer and push it into the RW_MULTI_OP
 *    packet.
 * 2. This function packs the entire pickled buf into the message, which costs
 *    one extra allocation for the multi-op sent over the fabric. The message
 *    hangs from the wr of the parent record so that it can be retransmitted.
 */
int
ldt_record_pickle(ldt_record *lrecord,
				  uint8_t               ** pickled_buf,
				  size_t                 * pickled_sz,
				  uint32_t               * pickled_void_time)
{
	cf_detail(AS_LDT, "Enter: MULTI_OP: Packing LDT record");

	udf_record *h_urecord  = as_rec_source(lrecord->h_urec);
	as_transaction   *h_tr = h_urecord->tr;

	// Do an early check if we need to replicate to other nodes. In cases like
	// single-replica or single-node we don't need to do any replication.
	cf_node dest_nodes_tmp[AS_CLUSTER_SZ];
	memset(dest_nodes_tmp, 0, sizeof(dest_nodes_tmp));
	int listsz = as_partition_getreplica_readall(h_tr->rsv.ns, h_tr->rsv.pid, dest_nodes_tmp);
	if (listsz == 0) {
		return 0;
	}

	bool is_delete       = (h_urecord->pickled_buf) ? false : true;
	int  ret             = 0;
	int  ops             = 0;
	// TODO: change hard coded 7 to meaningful constant.
	msg *m[7];
	memset(m, 0, 7 * sizeof(msg *));


	if (is_delete) {
		*pickled_buf = 0;
		*pickled_sz  = 0;
	} else {
		size_t sz     = 0;
		size_t buflen = 0;

		m[ops] = as_fabric_msg_get(M_TYPE_RW);
		if (!m[ops]) {
			ret = -3;
			goto Out;
		}
		if (!is_delete && h_urecord->pickled_buf) {
			cf_detail(AS_LDT, "MULTI_OP: Packing LDT Head Record");
			rw_msg_setup(m[ops], h_tr, &h_tr->keyd,
							&h_urecord->pickled_buf,
							h_urecord->pickled_sz,
							h_urecord->pickled_void_time,
							&h_urecord->pickled_rec_props,
							RW_OP_WRITE,
							h_urecord->ldt_rectype_bits, true);
			buflen = 0;
			msg_fillbuf(m[ops], NULL, &buflen);
			sz += buflen;
			ops++;
		}

		// This macro is a for-loop thru the SR list and a test for valid SR entry
		FOR_EACH_SUBRECORD(i, lrecord) {
			udf_record *c_urecord = &lrecord->chunk[i].c_urecord;
			is_delete             = (c_urecord->pickled_buf) ? false : true;
			as_transaction *c_tr  = c_urecord->tr;

			if ( ((!c_urecord->pickled_buf) || (c_urecord->pickled_sz <= 0)) && !is_delete ) {
				cf_warning(AS_RW, "Got an empty pickled buf while trying to "
						" replicate record with digest %"PRIx64" %p, %d, %d",
						(uint64_t *)&c_tr->keyd, pickled_buf, pickled_sz, is_delete);
				ret = -2;
				goto Out;
			}

			// if pickled_buf is there then it is a write operation
			if (!is_delete && c_urecord->pickled_buf) {
				cf_detail(AS_LDT, "MULTI_OP: Packing LDT SUB Record");
				m[ops] = as_fabric_msg_get(M_TYPE_RW);
				if (!m[ops]) {
					ret = -3;
					goto Out;
				}
				rw_msg_setup(m[ops], c_tr, &c_tr->keyd,
								&c_urecord->pickled_buf,
								c_urecord->pickled_sz,
								c_urecord->pickled_void_time,
								&c_urecord->pickled_rec_props,
								RW_OP_WRITE,
								c_urecord->ldt_rectype_bits, true);
				buflen = 0;
				msg_fillbuf(m[ops], NULL, &buflen);
				sz += buflen;
				ops++;
			}
		}

		if (sz) {
			uint8_t *buf = cf_malloc(sz);
			if (!buf) {
				pickled_sz   = 0;
				*pickled_buf = NULL;
				ret          = -1;
				goto Out;
			}
			*pickled_buf = buf;
			*pickled_sz  = sz;
			int rsz = sz;
			sz = 0;

			for (int i = 0; i < ops; i++) {
				sz = rsz - sz;
				ret = msg_fillbuf(m[i], buf, &sz);
				buf += sz;
			}
			*pickled_void_time = 0;
		}
	}
Ejemplo n.º 2
0
/*
 * Ship the client message held by a write request to another node as a
 * "shipped op" proxy request.
 *
 * dst - destination node; must be non-zero (cf_crash() is called otherwise).
 * wr  - write request whose msgp is handed off to the fabric message. On the
 *       paths past the message allocation, wr->msgp is cleared, and on the
 *       success path wr->shipped_op_initiator is set.
 *
 * Returns -1 if no fabric message could be obtained or the retransmit hash
 * insert failed, 0 otherwise. Note that 0 is also returned when
 * as_fabric_send() reports an error - the entry stays in the retransmit hash.
 */
int
as_proxy_shipop(cf_node dst, write_request *wr)
{
	as_partition_id pid = as_partition_getid(wr->keyd);

	if (dst == 0) {
		cf_crash(AS_PROXY, "the destination should never be zero");
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);
	if (!m)	{
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &wr->keyd, sizeof(cf_digest), MSG_SET_COPY);
	// The client message buffer is handed off (not copied) to the fabric
	// message, which is why wr->msgp is cleared below once the last field that
	// reads through it has been set.
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) wr->msgp, as_proto_size_get(&wr->msgp->proto), MSG_SET_HANDOFF_MALLOC);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, as_paxos_get_cluster_key());
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, wr->msgp->msg.transaction_ttl);
	wr->msgp = 0;

	// Mark the request as a shipped op.
	uint32_t info = 0;
	info |= PROXY_INFO_SHIPPED_OP;
	msg_set_uint32(m, PROXY_FIELD_INFO, info);

	// tid is a uint32_t, so it is printed with %u (the previous %d did not
	// match the argument's signedness).
	cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP %s->WINNER msg %p Proxy Sent to %"PRIx64" %p tid(%u)",
			wr->proxy_msg ? "NONORIG" : "ORIG", m, dst, wr, tid);

	// Fill out a retransmit structure, insert into the retransmit hash. The
	// extra message reference and the wr reservation are taken on behalf of
	// the hash entry.
	msg_incr_ref(m);
	proxy_request pr;
	pr.start_time  = wr->start_time;
	pr.end_time    = (wr->end_time != 0) ? wr->end_time : pr.start_time + g_config.transaction_max_ns;
	cf_rc_reserve(wr);
	pr.wr          = wr;
	pr.fab_msg     = m;
	pr.xmit_ms     = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest        = dst;
	pr.pid         = pid;
	pr.fd_h        = NULL;
	pr.batch_shared = NULL;
	pr.batch_index = 0;
	// NOTE(review): pr.ns is not assigned here, unlike in as_proxy_divert(),
	// so an indeterminate value is copied into the hash - confirm no consumer
	// reads pr.ns for shipped ops.

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		// NOTE(review): the message references and the wr reservation taken
		// above are not released on this path - confirm the caller's cleanup
		// contract before adding releases here.
		cf_info(AS_PROXY, " shash_put failed, need cleanup code");
		return -1;
	}

	// Send to the remote node.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		// Send failed: drop the reference that was meant for the fabric. The
		// hash entry keeps its own reference.
		cf_detail(AS_PROXY, "SHIPPED_OP ORIG [Digest %"PRIx64"] Failed with %d", *(uint64_t *)&wr->keyd, rv);
		as_fabric_msg_put(m);
	}

	wr->shipped_op_initiator = true;
	cf_atomic_int_incr(&g_config.ldt_proxy_initiate);

	return 0;
}
Ejemplo n.º 3
0
// Build (or rebuild) the replica-write fabric message hanging off rw->dest_msg.
//
// An existing rw->dest_msg is reset and reused; otherwise a new M_TYPE_RW
// message is obtained. Returns false only if that allocation fails, true once
// the message has been filled in.
//
// Ownership: rw->pickled_buf (and, for non-multiop writes, the rec-props data)
// is handed off to the message, and the corresponding rw members are cleared
// so the rw destructor does not free them.
bool
repl_write_make_message(rw_request* rw, as_transaction* tr)
{
	if (! rw->dest_msg) {
		rw->dest_msg = as_fabric_msg_get(M_TYPE_RW);

		if (! rw->dest_msg) {
			return false;
		}
	}
	else {
		msg_reset(rw->dest_msg);
	}

	msg* out = rw->dest_msg;
	as_namespace* nsp = tr->rsv.ns;

	// Fields common to every replica write.
	uint32_t op = rw->is_multiop ? RW_OP_MULTI : RW_OP_WRITE;

	msg_set_uint32(out, RW_FIELD_OP, op);
	msg_set_buf(out, RW_FIELD_NAMESPACE, (uint8_t*)nsp->name,
			strlen(nsp->name), MSG_SET_COPY);
	msg_set_uint32(out, RW_FIELD_NS_ID, nsp->id);
	msg_set_buf(out, RW_FIELD_DIGEST, (void*)&tr->keyd, sizeof(cf_digest),
			MSG_SET_COPY);
	msg_set_uint64(out, RW_FIELD_CLUSTER_KEY, tr->rsv.cluster_key);
	msg_set_uint32(out, RW_FIELD_TID, rw->tid);

	// Record metadata.
	msg_set_uint32(out, RW_FIELD_GENERATION, tr->generation);
	msg_set_uint32(out, RW_FIELD_VOID_TIME, tr->void_time);
	msg_set_uint64(out, RW_FIELD_LAST_UPDATE_TIME, tr->last_update_time);

	// TODO - do we really intend to send this if the record is non-LDT?
	if (nsp->ldt_enabled) {
		msg_set_buf(out, RW_FIELD_VINFOSET,
				(uint8_t*)&tr->rsv.p->version_info,
				sizeof(as_partition_vinfo), MSG_SET_COPY);

		// The LDT version is only sent when it is non-zero.
		if (tr->rsv.p->current_outgoing_ldt_version != 0) {
			msg_set_uint64(out, RW_FIELD_LDT_VERSION,
					tr->rsv.p->current_outgoing_ldt_version);
		}
	}

	// Multi-op: the pickled buffer carries all the packed sub-messages.
	if (rw->is_multiop) {
		msg_set_uint32(out, RW_FIELD_INFO, RW_INFO_LDT);
		msg_set_buf(out, RW_FIELD_MULTIOP, (void*)rw->pickled_buf,
				rw->pickled_sz, MSG_SET_HANDOFF_MALLOC);

		// Handed off above - keep the rw destructor away from it.
		rw->pickled_buf = NULL;

		return true;
	}

	uint32_t info_bits = pack_info_bits(tr, rw->has_udf);

	if (! rw->pickled_buf) {
		// No pickled record - replica delete. Send a copy of the original
		// client message instead.
		msg_set_buf(out, RW_FIELD_AS_MSG, (void*)tr->msgp,
				as_proto_size_get(&tr->msgp->proto), MSG_SET_COPY);

		info_bits |= pack_ldt_info_bits(tr, false, false);
	}
	else {
		// Pickled record present - replica write.
		bool ldt_parent;
		bool ldt_sub;

		as_ldt_get_property(&rw->pickled_rec_props, &ldt_parent, &ldt_sub);
		info_bits |= pack_ldt_info_bits(tr, ldt_parent, ldt_sub);

		msg_set_buf(out, RW_FIELD_RECORD, (void*)rw->pickled_buf,
				rw->pickled_sz, MSG_SET_HANDOFF_MALLOC);

		// Handed off above - keep the rw destructor away from it.
		rw->pickled_buf = NULL;

		if (rw->pickled_rec_props.p_data) {
			msg_set_buf(out, RW_FIELD_REC_PROPS,
					rw->pickled_rec_props.p_data,
					rw->pickled_rec_props.size, MSG_SET_HANDOFF_MALLOC);

			// Same for the rec-props data.
			as_rec_props_clear(&rw->pickled_rec_props);
		}
	}

	msg_set_uint32(out, RW_FIELD_INFO, info_bits);

	return true;
}
Ejemplo n.º 4
0
// Make a request to another node.
//
// Note: there's a cheat here. 'as_msg' is used in a raw form, and includes
// structured data (version - type - nfields - sz ...) which should be made more
// wire-protocol-friendly.
// Divert a transaction to another node via a proxy request.
//
// dst         - destination node; when 0, as_partition_getreplica_read() is
//               used to pick one for the transaction's partition.
// tr          - transaction to proxy. Its msgp and proto_fd_h are cleared here
//               once they have been passed to the fabric message and the
//               retransmit entry respectively.
// ns          - namespace, used for the replica lookup and stored in the
//               retransmit entry.
// cluster_key - cluster key placed in the proxy message.
//
// Returns -1 if no fabric message could be obtained or the retransmit hash
// insert failed, 0 otherwise (including when as_fabric_send() reports an
// error).
int
as_proxy_divert(cf_node dst, as_transaction *tr, as_namespace *ns, uint64_t cluster_key)
{
	cf_detail(AS_PROXY, "proxy divert");

	// Stats: every divert counts, XDR-originated ones are counted separately.
	cf_atomic_int_incr(&g_config.stat_proxy_reqs);

	if (tr->msgp) {
		if (tr->msgp->msg.info1 & AS_MSG_INFO1_XDR) {
			cf_atomic_int_incr(&g_config.stat_proxy_reqs_xdr);
		}
	}

	as_partition_id part_id = as_partition_getid(tr->keyd);

	// No explicit destination - ask the partition for a read replica.
	if (dst == 0) {
		dst = as_partition_getreplica_read(ns, part_id);
	}

	msg *fab_msg = as_fabric_msg_get(M_TYPE_PROXY);

	if (!fab_msg) {
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	// Batch sub-transactions get a copy of the client message; otherwise the
	// buffer is handed off to the fabric message.
	msg_set_type proto_set_mode;

	if (tr->batch_shared) {
		proto_set_mode = MSG_SET_COPY;
	}
	else {
		proto_set_mode = MSG_SET_HANDOFF_MALLOC;
	}

	msg_set_uint32(fab_msg, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(fab_msg, PROXY_FIELD_TID, tid);
	msg_set_buf(fab_msg, PROXY_FIELD_DIGEST, (void *) &tr->keyd, sizeof(cf_digest), MSG_SET_COPY);
	msg_set_buf(fab_msg, PROXY_FIELD_AS_PROTO, (void *) tr->msgp, as_proto_size_get(&tr->msgp->proto), proto_set_mode);
	msg_set_uint64(fab_msg, PROXY_FIELD_CLUSTER_KEY, cluster_key);
	msg_set_uint32(fab_msg, PROXY_FIELD_TIMEOUT_MS, tr->msgp->msg.transaction_ttl);

	// The transaction no longer refers to the client message.
	tr->msgp = 0;

	cf_debug_digest(AS_PROXY, &tr->keyd, "proxy_divert: fab_msg %p dst %"PRIx64, fab_msg, dst);

	// Build the retransmit entry. The extra reference taken here is the one
	// held through the retransmit hash entry.
	msg_incr_ref(fab_msg);

	proxy_request pr;

	pr.start_time = tr->start_time;

	if (tr->end_time != 0) {
		pr.end_time = tr->end_time;
	}
	else {
		pr.end_time = pr.start_time + g_config.transaction_max_ns;
	}

	// The client connection handle moves from the transaction to the entry.
	pr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;

	pr.fab_msg = fab_msg;
	pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest = dst;
	pr.pid = part_id;
	pr.ns = ns;
	pr.wr = NULL;
	pr.batch_shared = tr->batch_shared;
	pr.batch_index = tr->batch_index;

	int put_rv = shash_put(g_proxy_hash, &tid, &pr);

	if (put_rv != 0) {
		// NOTE(review): nothing acquired above is released on this path -
		// confirm the caller's cleanup contract before changing that.
		cf_debug(AS_PROXY, " shash_put failed, need cleanup code");
		return -1;
	}

	// Send to the remote node. On failure only the sender's reference is
	// dropped; the hash entry keeps its own.
	int send_rv = as_fabric_send(dst, fab_msg, AS_FABRIC_PRIORITY_MEDIUM);

	if (send_rv != 0) {
		cf_debug(AS_PROXY, "as_proxy_divert: returned error %d", send_rv);
		as_fabric_msg_put(fab_msg);
	}

	cf_atomic_int_incr(&g_config.proxy_initiate);

	return 0;
}
Ejemplo n.º 5
0
// For LDTs only:
// For LDTs only:
//
// Handle an incoming multi-op replica write. The RW_FIELD_MULTIOP buffer is a
// back-to-back sequence of packed M_TYPE_RW messages; each one is parsed and
// passed to handle_multiop_subop() under a single partition reservation.
//
// node - node the ack is sent to.
// m    - the received multi-op message.
//
// Exactly one ack is sent on every path: AS_PROTO_RESULT_OK when all sub-ops
// were applied, AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH when the partition
// is absent, AS_PROTO_RESULT_FAIL_UNKNOWN on any other failure. Processing
// stops at the first failing sub-op.
void
repl_write_handle_multiop(cf_node node, msg* m)
{
	uint8_t* ns_name;
	size_t ns_name_len;

	if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "handle_multiop: no namespace");
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len);

	if (! ns) {
		cf_warning(AS_RW, "handle_multiop: invalid namespace");
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	cf_digest* keyd;
	size_t keyd_sz;

	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &keyd_sz,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "handle_multiop: no digest");
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	// keyd points directly into the message, and *keyd is read below - make
	// sure the field really holds a whole digest.
	if (keyd_sz != sizeof(cf_digest)) {
		cf_warning(AS_RW, "handle_multiop: bad digest size");
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	// Note - there should be an RW_FIELD_INFO with LDT bit set, but not
	// bothering to get it here since we never use it.

	uint8_t* pickled_buf;
	size_t pickled_sz;

	if (msg_get_buf(m, RW_FIELD_MULTIOP, (uint8_t**)&pickled_buf, &pickled_sz,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "handle_multiop: no buffer");
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	as_partition_reservation rsv;

	as_partition_reserve_migrate(ns, as_partition_getid(*keyd), &rsv, NULL);

	if (rsv.state == AS_PARTITION_STATE_ABSENT) {
		as_partition_release(&rsv);
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH);
		return;
	}

	ldt_prole_info linfo;
	// NOTE(review): every byte is set to 1, not 0 - kept as-is since the
	// consumers of linfo are not visible here; confirm this is intentional.
	memset(&linfo, 1, sizeof(ldt_prole_info));

	size_t offset = 0;
	bool all_ok = true;

	// offset never exceeds pickled_sz: each step advances by a sub-message
	// length that was checked against the bytes remaining.
	while (offset < pickled_sz) {
		const uint8_t* buf = (const uint8_t*)(pickled_buf + offset);
		size_t remaining = pickled_sz - offset;

		uint32_t op_msg_len = 0;
		msg_type op_msg_type = 0;

		// Besides the peek itself, reject a sub-message length of zero (the
		// loop would never advance) or one that runs past the end of the
		// buffer (the next iteration's remaining size would underflow).
		if (msg_get_initial(&op_msg_len, &op_msg_type, buf, remaining) != 0 ||
				op_msg_type != M_TYPE_RW ||
				op_msg_len == 0 || op_msg_len > remaining) {
			cf_warning(AS_RW, "handle_multiop: peek multiop msg failed");
			all_ok = false;
			break;
		}

		msg* op_msg = as_fabric_msg_get(op_msg_type);

		if (! op_msg) {
			cf_warning(AS_RW, "handle_multiop: can't get fabric msg");
			all_ok = false;
			break;
		}

		if (msg_parse(op_msg, buf, remaining) != 0) {
			cf_warning(AS_RW, "handle_multiop: can't parse multiop msg");
			as_fabric_msg_put(op_msg);
			all_ok = false;
			break;
		}

		offset += op_msg_len;

		if (! handle_multiop_subop(node, op_msg, &rsv, &linfo)) {
			cf_warning(AS_RW, "handle_multiop: write_process_new failed");
			as_fabric_msg_put(op_msg);
			all_ok = false;
			break;
		}

		as_fabric_msg_put(op_msg);
	}

	// Single exit for the loop: release the reservation, then ack.
	as_partition_release(&rsv);
	send_multiop_ack(node, m,
			all_ok ? AS_PROTO_RESULT_OK : AS_PROTO_RESULT_FAIL_UNKNOWN);
}