Example #1
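// Send an ops response back over the fabric, reusing the incoming msg. A
// stack-backed dyn-buf is copied; a heap-backed one is handed off to the
// fabric layer, which then owns the buffer.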
int
as_proxy_send_ops_response(cf_node dst, msg *m, cf_dyn_buf *db)
{
	uint32_t tid;
	msg_get_uint32(m, PROXY_FIELD_TID, &tid);

#ifdef DEBUG
	cf_debug(AS_PROXY, "proxy send response: message %p bytearray %p tid %d", m, result_code, tid);
#endif

	msg_reset(m);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_RESPONSE);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);

	uint8_t *msgp = db->buf;
	size_t msg_sz = db->used_sz;

	if (db->is_stack) {
		msg_set_buf(m, PROXY_FIELD_AS_PROTO, msgp, msg_sz, MSG_SET_COPY);
	}
	else {
		msg_set_buf(m, PROXY_FIELD_AS_PROTO, msgp, msg_sz, MSG_SET_HANDOFF_MALLOC);
		db->buf = NULL; // the fabric owns the buffer now
	}

	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		cf_debug(AS_PROXY, "sending proxy response: fabric send err %d, catch you on the retry", rv);
		as_fabric_msg_put(m);
	}

	return 0;
} // end as_proxy_send_ops_response()
Example #2
// Looked up the message in the store. Time to send the response value back to
// the requester. The CF_BYTEARRAY is handed off in this case. If you want to
// keep a reference, then keep the reference yourself.
int
as_proxy_send_response(cf_node dst, msg *m, uint32_t result_code, uint32_t generation,
		uint32_t void_time, as_msg_op **ops, as_bin **bins, uint16_t bin_count,
		as_namespace *ns, uint64_t trid, const char *setname)
{
	uint32_t tid;
	msg_get_uint32(m, PROXY_FIELD_TID, &tid);

#ifdef DEBUG
	cf_debug(AS_PROXY, "proxy send response: message %p bytearray %p tid %d", m, result_code, tid);
#endif

	msg_reset(m);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_RESPONSE);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);

	size_t msg_sz = 0;
	cl_msg * msgp = as_msg_make_response_msg(result_code, generation, void_time, ops,
			bins, bin_count, ns, 0, &msg_sz, trid, setname);

	if (! msgp) {
		// Defensive - don't hand the fabric a NULL buffer.
		cf_warning(AS_PROXY, "failed to make proxy response msg");
		as_fabric_msg_put(m);
		return -1;
	}

	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (byte *) msgp, msg_sz, MSG_SET_HANDOFF_MALLOC);

	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		cf_debug(AS_PROXY, "sending proxy response: fabric send err %d, catch you on the retry", rv);
		as_fabric_msg_put(m);
	}

	return 0;
} // end as_proxy_send_response()
Example #3
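// Fire-and-forget replica writes - take a ref per destination node and send.
// On a failed send just drop the ref; nothing waits for acks or retransmits.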
void
send_rw_messages_forget(rw_request* rw)
{
	for (uint32_t i = 0; i < rw->n_dest_nodes; i++) {
		msg_incr_ref(rw->dest_msg);

		if (as_fabric_send(rw->dest_nodes[i], rw->dest_msg,
				AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) {
			as_fabric_msg_put(rw->dest_msg);
		}
	}
}
Example #4
// For LDTs only:
void
send_multiop_ack(cf_node node, msg* m, uint32_t result)
{
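	// Keep only the fields the ack needs, then reuse the incoming msg.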
	msg_preserve_fields(m, 3, RW_FIELD_NS_ID, RW_FIELD_DIGEST, RW_FIELD_TID);

	msg_set_uint32(m, RW_FIELD_OP, RW_OP_MULTI_ACK);
	msg_set_uint32(m, RW_FIELD_RESULT, result);

	if (as_fabric_send(node, m, AS_FABRIC_PRIORITY_MEDIUM) !=
			AS_FABRIC_SUCCESS) {
		as_fabric_msg_put(m);
	}
}
Example #5
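// Send (or retransmit) replica-write msgs to destination nodes that haven't
// yet acked. A failed send zeroes xmit_ms so the retransmit thread tries
// again on its next cycle.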
void
send_rw_messages(rw_request* rw)
{
	for (uint32_t i = 0; i < rw->n_dest_nodes; i++) {
		if (rw->dest_complete[i]) {
			continue;
		}

		msg_incr_ref(rw->dest_msg);

		if (as_fabric_send(rw->dest_nodes[i], rw->dest_msg,
				AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) {
			as_fabric_msg_put(rw->dest_msg);
			rw->xmit_ms = 0; // force a retransmit on next cycle
		}
	}
}
Example #6
// Send a redirection message - consumes the message.
int
as_proxy_send_redirect(cf_node dst, msg *m, cf_node rdst)
{
	int rv;
	uint32_t tid;
	msg_get_uint32(m, PROXY_FIELD_TID, &tid);

	msg_reset(m);
	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REDIRECT);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_uint64(m, PROXY_FIELD_REDIRECT, rdst);

	if (0 != (rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM))) {
		cf_debug(AS_PROXY, "sending redirection failed: fabric send error %d", rv);
		as_fabric_msg_put(m);
	}

	return 0;
} // end as_proxy_send_redirect()
Example #7
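// CE stub - a meta-batch ack should never arrive here. Log it and release
// the msg.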
void
immigration_handle_meta_batch_ack(cf_node src, msg *m)
{
	cf_warning(AS_MIGRATE, "CE node received meta-batch ack - unexpected");
	as_fabric_msg_put(m);
}
Example #8
// Incoming messages start here.
// - Could get a request that we need to service.
// - Could get a response to one of our requests - need to find the request and
//   send the real response to the remote end.
int
proxy_msg_fn(cf_node id, msg *m, void *udata)
{
	int rv;

	if (cf_rc_count((void*)m) == 0) {
		cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naugty %p", m);
		return -1;
	}

	uint32_t op = 99999;
	msg_get_uint32(m, PROXY_FIELD_OP, &op);
	uint32_t transaction_id = 0;
	msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id);

	cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id);

	switch (op) {
		case PROXY_OP_REQUEST:
		{
			cf_atomic_int_incr(&g_config.proxy_action);

#ifdef DEBUG
			cf_debug(AS_PROXY, "Proxy_msg: received request");
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy msg");
#endif
#endif
			cf_digest *key;
			size_t sz = 0;
			if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
				cf_info(AS_PROXY, "proxy msg function: no digest, problem");
				as_fabric_msg_put(m);
				return 0;
			}
			cl_msg *msgp;
			size_t as_msg_sz = 0;
			if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) {
				cf_info(AS_PROXY, "proxy msg function: no as msg, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			uint64_t cluster_key = 0;
			if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) {
				cf_info(AS_PROXY, "proxy msg function: no cluster key, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			// This is allowed to fail - this is a new field, and gets defaulted
			// to 0 if it doesn't exist.
			uint32_t timeout_ms = 0;
			msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms);
//			cf_info(AS_PROXY, "proxy msg: received timeout_ms of %d",timeout_ms);

			// Put the as_msg on the normal queue for processing.
			// INIT_TR
			as_transaction tr;
			as_transaction_init(&tr, key, msgp);
			tr.incoming_cluster_key = cluster_key;
			tr.end_time             = (timeout_ms != 0) ? ((uint64_t)timeout_ms * 1000000) + tr.start_time : 0;
			tr.proxy_node           = id;
			tr.proxy_msg            = m;

			// Check here if this is shipped op.
			uint32_t info = 0;
			msg_get_uint32(m, PROXY_FIELD_INFO, &info);
			if (info & PROXY_INFO_SHIPPED_OP) {
				tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP;
				cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received");
			} else {
				cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid);
			}

			MICROBENCHMARK_RESET();

			thr_tsvc_enqueue(&tr);
		}
		break;

		case PROXY_OP_RESPONSE:
		{
#ifdef DEBUG
			// Got the response from the actual endpoint.
			cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id);
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy response");
#endif
#endif

			// Look up the element.
			proxy_request pr;
			bool free_msg = true;
			if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) {
				// Found the element (sometimes we get two acks so it's OK for
				// an ack to not find the transaction).

				if (pr.wr) {
					as_proxy_shipop_response_hdlr(m, &pr, &free_msg);
				} else {
					as_proto *proto = NULL;
					size_t proto_sz = 0;
					// Initialized so a failed get can't leave these dangling.
					if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
						cf_info(AS_PROXY, "msg get buf failed!");
					}

#ifdef DEBUG_VERBOSE
					cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %"PRIu64" %d", proto, proto_sz, pr.fd);
					for (size_t _i = 0; _i < proto_sz; _i++) {
						fprintf(stderr, " %x", ((byte *)proto)[_i]);
						if (_i % 16 == 15) {
							fprintf(stderr, "\n");
						}
					}
#endif

#ifdef EXTRA_CHECKS
					as_proto proto_copy = *proto;
					as_proto_swap(&proto_copy);
					if (proto_copy.sz + 8 != proto_sz) {
						cf_info(AS_PROXY, "BONE BONE BONE!!!");
						cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %u", (uint64_t) proto_copy.sz, proto_sz);
					}
#endif

					// Write to the file descriptor.
					cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd);
					cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0");

					if (pr.batch_shared) {
						cf_digest* digest;
						size_t digest_sz = 0;

						if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) {
							as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz);
							as_proxy_set_stat_counters(0);
						}
						else {
							cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id);
							as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN);
							as_proxy_set_stat_counters(-1);
						}
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);
					}
					else {
						size_t pos = 0;
						while (pos < proto_sz) {
							rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);
							if (rv > 0) {
								pos += rv;
							}
							else if (rv < 0) {
								if (errno != EWOULDBLOCK) {
									// Common message when a client aborts.
									cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %d pos %d rv %d errno %d", pr.fd_h->fd, proto_sz, pos, rv, errno);
									shutdown(pr.fd_h->fd, SHUT_RDWR);
									as_proxy_set_stat_counters(-1);
									goto SendFin;
								}
								usleep(1); // yield
							}
							else {
								cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", pr.fd_h->fd, proto_sz, pos);
								shutdown(pr.fd_h->fd, SHUT_RDWR);
								as_proxy_set_stat_counters(-1);
								goto SendFin;
							}
						}
						as_proxy_set_stat_counters(0);
SendFin:
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);

						// Return the fabric message or the direct file descriptor -
						// after write and complete.
						pr.fd_h->t_inprogress = false;
						AS_RELEASE_FILE_HANDLE(pr.fd_h);
						pr.fd_h = 0;
					}
					as_fabric_msg_put(pr.fab_msg);
					pr.fab_msg = 0;
				}
			}
			else {
				cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id);
				as_proxy_set_stat_counters(-1);
			}

			if (free_msg) {
				as_fabric_msg_put(m);
			}
		}
		break;

		case PROXY_OP_REDIRECT:
		{
			// Sometimes the destination we proxied a request to isn't able to
			// satisfy it (for example, their copy of the partition in question
			// might be desync).
			cf_node new_dst = 0;
			msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst);
			cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst);

			// Look in the proxy retransmit hash for the tid.
			proxy_request *pr;
			pthread_mutex_t *pr_lock;
			int r = 0;
			if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) {
				cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id);
				as_fabric_msg_put(m);
				return -1;
			}

			if (g_config.self_node == new_dst) {

				// Although we don't know we're the final destination, undo the
				// proxy-nature and put back on the main queue. Dangerous, as it
				// leaves open the possibility of a looping message.

				cf_digest *key;
				size_t sz = 0;
				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				cl_msg *msgp;
				sz = 0;
				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				// Put the as_msg on the normal queue for processing.
				// INIT_TR
				as_transaction tr;
				as_transaction_init(&tr, key, msgp);
				tr.start_time = pr->start_time; // start time
				tr.end_time   = pr->end_time;
				tr.proto_fd_h = pr->fd_h;
				tr.batch_shared = pr->batch_shared;
				tr.batch_index = pr->batch_index;

				MICROBENCHMARK_RESET();

				thr_tsvc_enqueue(&tr);

				as_fabric_msg_put(pr->fab_msg);
				shash_delete_lockfree(g_proxy_hash, &transaction_id);
			}
			else {
				// Change the destination, update the retransmit time.
				pr->dest = new_dst;
				pr->xmit_ms = cf_getms() + 1;

				// Send it.
				msg_incr_ref(pr->fab_msg);
				if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) {
					cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv);
					as_fabric_msg_put(pr->fab_msg);
				}
			}

			pthread_mutex_unlock(pr_lock);
		}
		as_fabric_msg_put(m);
		break;
		default:
			cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op);
			msg_dump(m, "proxy received unknown msg");
			as_fabric_msg_put(m);
			break;
	} // end switch

	return 0;
} // end proxy_msg_fn()
Example #9
/*
 * The workhorse function to process the acknowledgment for the duplicate op.
 * It is received after the intended node has finished performing the op. In
 * case of success the op would have been successfully performed and replicated.
 * In case of failure the op would not have been performed anywhere.
 *
 * The retransmit is handled by making sure the op hangs from the write hash
 * for as long as it has not been applied or failed. Any attempt to perform
 * the next operation has to wait behind it until it is finished. The op is
 * also assigned a timestamp, so there is some protection in case the op
 * arrives out of order, or the same op comes back again - that would be a
 * duplicate op ...
 *
 * Received an op message - I'm a winner duplicate on this partition. Perform
 * the UDF op and replicate to all the nodes in the replica list. We only
 * replicate the subrecord if the partition is in the subrecord migration
 * phase. If not, ship both subrecord and record. If the partition is a read
 * replica on this node, do the write and signal back that I'm done.
 *
 * THUS - PROLE SIDE
 *
 * is_write is misnamed. Differentiates between the 'duplicate' phase and the
 *    'operation' phase. If is_write == false, we're in the 'duplicate' phase.
 *
 * Algorithm
 *
 * This code is called when op is shipped to the winner node.
 *
 * 1. Assert that the current node is indeed the winner node.
 * 2. Assert the cluster key matches.
 * 3. Create a transaction and apply the UDF. Create an internal transaction and
 *    make sure it does some sort of reservation and applies the write and
 *    replicates to replica set. Once the write is done it sends the op ack.
 *
 *    TODO: How do you handle retransmits?
 *    TODO: How do you handle partition reservation? Is it something special?
 *    TODO: How to send along with replication request? Same infra should be
 *          used by normal replication as well.
 *
 *    There won't be any deadlock because the requests are triggered from the
 *    write. Get down to the udf apply code. Replicate to replica set and then
 *    make sure the response is sent back to the originating node. This node has
 *    to make sure the replication actually did succeed.
 *
 * In the response code you need to add the callback function.
 */
int
as_proxy_shipop_response_hdlr(msg *m, proxy_request *pr, bool *free_msg)
{
	int rv            = -1;
	write_request *wr = pr->wr;
	if (!wr) {
		return -1;
	}
	cf_assert((pr->fd_h == NULL), AS_PROXY, CF_WARNING, "fd_h set for shipop proxy response");

	// If there is a write request hanging from pr then this is a response to
	// the proxy ship op request. This node is the resolving node (node @ which
	// duplicate resolution was triggered). It could be:
	// 1. Originating node [where the request was sent from the client] - in
	//    that case send response back to the client directly.
	// 2. Non-originating node [where the request arrived as a regular proxy] -
	//    in that case send response back to the proxy originating node.

	// Case 2: Non-originating node.
	if (wr->proxy_msg) {
		// Remember that "digest" gets printed at the end of cf_detail_digest().
		// Fake the ORIGINATING Proxy tid
		uint32_t transaction_id = 0;
		msg_get_uint32(wr->proxy_msg, PROXY_FIELD_TID, &transaction_id);
		msg_set_uint32(m, PROXY_FIELD_TID, transaction_id);
		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Got Op Response(%p) :", wr);
		cf_detail_digest(AS_PROXY, &(wr->keyd), "SHIPPED_OP NON-ORIG :: Back Forwarding Response for tid (%d). : ", transaction_id);
		if (0 != (rv = as_fabric_send(wr->proxy_node, m, AS_FABRIC_PRIORITY_MEDIUM))) {
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP NONORIG Failed Forwarding Response");
			as_fabric_msg_put(m);
		}
		*free_msg = false;
	}
	// Case 1: Originating node.
	else {
		cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Got Op Response");
		pthread_mutex_lock(&wr->lock);
		if (wr->proto_fd_h) {
			if (!wr->proto_fd_h->fd) {
				cf_warning_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing fd in proto_fd ");
			}
			else {
				as_proto *proto = NULL;
				size_t proto_sz = 0;
				// Initialized so a failed get can't leave these dangling.
				if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
					cf_info(AS_PROXY, "msg get buf failed!");
				}
				size_t pos = 0;
				while (pos < proto_sz) {
					rv = send(wr->proto_fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);
					if (rv > 0) {
						pos += rv;
					}
					else if (rv < 0) {
						if (errno != EWOULDBLOCK) {
							// Common message when a client aborts.
							cf_debug(AS_PROTO, "protocol proxy write fail: fd %d "
									"sz %d pos %d rv %d errno %d",
									wr->proto_fd_h->fd, proto_sz, pos, rv, errno);
							shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
							break;
						}
						usleep(1); // yield
					}
					else {
						cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ",
								wr->proto_fd_h->fd, proto_sz, pos);
						shutdown(wr->proto_fd_h->fd, SHUT_RDWR);
						break;
					}
				}
				cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Response Sent to Client");
			}
			wr->proto_fd_h->t_inprogress = false;
			AS_RELEASE_FILE_HANDLE(wr->proto_fd_h);
			wr->proto_fd_h = 0;
		} else {
			// This may be NULL if the request has already timed out and the
			// wr proto_fd_h has been cleaned up by then.
			cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP ORIG Missing proto_fd ");

			// Note: This may be needed if this is the node where an internal
			// scan or query UDF was initiated while a migration was going on,
			// causing the request to be routed to the remote winning node.
			// Such a request may need the req_cb to be called.
			if (udf_rw_needcomplete_wr(wr)) {
				as_transaction tr;
				write_request_init_tr(&tr, wr);
				udf_rw_complete(&tr, 0, __FILE__, __LINE__);
				if (tr.proto_fd_h) {
					tr.proto_fd_h->t_inprogress = false;
					AS_RELEASE_FILE_HANDLE(tr.proto_fd_h);
					tr.proto_fd_h = 0;
				}
			}
		}
		pthread_mutex_unlock(&wr->lock);
	}

	// This node is the shipop initiator - remove the entry from the global
	// write hash.
	global_keyd gk;
	gk.ns_id = wr->rsv.ns->id;
	gk.keyd = wr->keyd;
	g_write_hash_delete(&gk);

	WR_RELEASE(pr->wr);
	pr->wr = NULL;
	return 0;
}
Example #10
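// Ship an op to the winner node - build a PROXY_OP_REQUEST that hands off the
// original cl_msg, park a retransmit entry (with the write request attached)
// in the proxy hash, then send over the fabric.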
int
as_proxy_shipop(cf_node dst, write_request *wr)
{
	as_partition_id pid = as_partition_getid(wr->keyd);

	if (dst == 0) {
		cf_crash(AS_PROXY, "the destination should never be zero");
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);
	if (!m)	{
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &wr->keyd, sizeof(cf_digest), MSG_SET_COPY);
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) wr->msgp, as_proto_size_get(&wr->msgp->proto), MSG_SET_HANDOFF_MALLOC);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, as_paxos_get_cluster_key());
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, wr->msgp->msg.transaction_ttl);
	wr->msgp = 0;

	// Flag this as a shipped op.
	uint32_t info = 0;
	info |= PROXY_INFO_SHIPPED_OP;
	msg_set_uint32(m, PROXY_FIELD_INFO, info);

	cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP %s->WINNER msg %p Proxy Sent to %"PRIx64" %p tid(%d)",
			wr->proxy_msg ? "NONORIG" : "ORIG", m, dst, wr, tid);

	// Fill out a retransmit structure, insert into the retransmit hash.
	msg_incr_ref(m);
	proxy_request pr;
	pr.start_time  = wr->start_time;
	pr.end_time    = (wr->end_time != 0) ? wr->end_time : pr.start_time + g_config.transaction_max_ns;
	cf_rc_reserve(wr);
	pr.wr          = wr;
	pr.fab_msg     = m;
	pr.xmit_ms     = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest        = dst;
	pr.pid         = pid;
	pr.fd_h        = NULL;
	pr.batch_shared = NULL;
	pr.batch_index = 0;

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_info(AS_PROXY, " shash_put failed, need cleanup code");
		return -1;
	}

	// Send to the remote node.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		cf_detail(AS_PROXY, "SHIPPED_OP ORIG [Digest %"PRIx64"] Failed with %d", *(uint64_t *)&wr->keyd, rv);
		as_fabric_msg_put(m);
	}

	wr->shipped_op_initiator = true;
	cf_atomic_int_incr(&g_config.ldt_proxy_initiate);

	return 0;
}
Example #11
// Make a request to another node.
//
// Note: there's a cheat here. 'as_msg' is used in a raw form, and includes
// structured data (version - type - nfields - sz ...) which should be made more
// wire-protocol-friendly.
int
as_proxy_divert(cf_node dst, as_transaction *tr, as_namespace *ns, uint64_t cluster_key)
{
	cf_detail(AS_PROXY, "proxy divert");

	cf_atomic_int_incr(&g_config.stat_proxy_reqs);
	if (tr->msgp && (tr->msgp->msg.info1 & AS_MSG_INFO1_XDR)) {
		cf_atomic_int_incr(&g_config.stat_proxy_reqs_xdr);
	}
	as_partition_id pid = as_partition_getid(tr->keyd);

	if (dst == 0) {
		// Get the list of replicas.
		dst = as_partition_getreplica_read(ns, pid);
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);
	if (!m)	{
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &tr->keyd, sizeof(cf_digest), MSG_SET_COPY);
	msg_set_type msettype = tr->batch_shared ? MSG_SET_COPY : MSG_SET_HANDOFF_MALLOC;
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) tr->msgp, as_proto_size_get(&tr->msgp->proto), msettype);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, cluster_key);
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, tr->msgp->msg.transaction_ttl);

	tr->msgp = 0;

	cf_debug_digest(AS_PROXY, &tr->keyd, "proxy_divert: fab_msg %p dst %"PRIx64, m, dst);

	// Fill out a retransmit structure, insert into the retransmit hash.
	msg_incr_ref(m);
	proxy_request pr;
	pr.start_time = tr->start_time;
	pr.end_time = (tr->end_time != 0) ? tr->end_time : pr.start_time + g_config.transaction_max_ns;
	pr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	pr.fab_msg = m;
	pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest = dst;
	pr.pid = pid;
	pr.ns = ns;
	pr.wr = NULL;
	pr.batch_shared = tr->batch_shared;
	pr.batch_index = tr->batch_index;

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_debug(AS_PROXY, " shash_put failed, need cleanup code");
		return -1;
	}

	// Send to the remote node.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		cf_debug(AS_PROXY, "as_proxy_divert: returned error %d", rv);
		as_fabric_msg_put(m);
	}

	cf_atomic_int_incr(&g_config.proxy_initiate);

	return 0;
}
Example #12
// For LDTs only:
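// Unpack a buffer of pickled sub-op msgs and apply each one under a single
// partition reservation, acking the sender on completion or failure.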
void
repl_write_handle_multiop(cf_node node, msg* m)
{
	uint8_t* ns_name;
	size_t ns_name_len;

	if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "handle_multiop: no namespace");
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len);

	if (! ns) {
		cf_warning(AS_RW, "handle_multiop: invalid namespace");
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	cf_digest* keyd;
	size_t sz;

	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "handle_multiop: no digest");
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	// Note - there should be an RW_FIELD_INFO with LDT bit set, but not
	// bothering to get it here since we never use it.

	uint8_t* pickled_buf;
	size_t pickled_sz;

	if (msg_get_buf(m, RW_FIELD_MULTIOP, (uint8_t**)&pickled_buf, &pickled_sz,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "handle_multiop: no buffer");
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	as_partition_reservation rsv;

	as_partition_reserve_migrate(ns, as_partition_getid(*keyd), &rsv, NULL);

	if (rsv.state == AS_PARTITION_STATE_ABSENT) {
		as_partition_release(&rsv);
		send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH);
		return;
	}

	ldt_prole_info linfo;
	memset(&linfo, 1, sizeof(ldt_prole_info));

	int offset = 0;

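	// Walk the pickled buffer, peeking, parsing and applying one sub-op msg
	// at a time.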
	while (true) {
		const uint8_t* buf = (const uint8_t*)(pickled_buf + offset);
		size_t sz = pickled_sz - offset;

		if (sz == 0) {
			break;
		}

		uint32_t op_msg_len = 0;
		msg_type op_msg_type = 0;

		if (msg_get_initial(&op_msg_len, &op_msg_type, buf, sz) != 0 ||
				op_msg_type != M_TYPE_RW) {
			cf_warning(AS_RW, "handle_multiop: peek multiop msg failed");
			as_partition_release(&rsv);
			send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return;
		}

		msg* op_msg = as_fabric_msg_get(op_msg_type);

		if (! op_msg) {
			cf_warning(AS_RW, "handle_multiop: can't get fabric msg");
			as_partition_release(&rsv);
			send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return;
		}

		if (msg_parse(op_msg, buf, sz) != 0) {
			cf_warning(AS_RW, "handle_multiop: can't parse multiop msg");
			as_fabric_msg_put(op_msg);
			as_partition_release(&rsv);
			send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return;
		}

		offset += op_msg_len;

		if (! handle_multiop_subop(node, op_msg, &rsv, &linfo)) {
			cf_warning(AS_RW, "handle_multiop: write_process_new failed");
			as_fabric_msg_put(op_msg);
			as_partition_release(&rsv);
			send_multiop_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return;
		}

		as_fabric_msg_put(op_msg);
	}

	as_partition_release(&rsv);
	send_multiop_ack(node, m, AS_PROTO_RESULT_OK);
}
Example #13
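// Handle a replica-write ack - validate the msg fields, look up the matching
// rw_request by (ns-id, digest) and tid, mark this destination node complete,
// and finish the transaction once all destinations have acked. Always
// consumes the fabric msg.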
void
repl_write_handle_ack(cf_node node, msg* m)
{
	uint32_t ns_id;

	if (msg_get_uint32(m, RW_FIELD_NS_ID, &ns_id) != 0) {
		cf_warning(AS_RW, "repl-write ack: no ns-id");
		as_fabric_msg_put(m);
		return;
	}

	cf_digest* keyd;
	size_t sz;

	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "repl-write ack: no digest");
		as_fabric_msg_put(m);
		return;
	}

	uint32_t tid;

	if (msg_get_uint32(m, RW_FIELD_TID, &tid) != 0) {
		cf_warning(AS_RW, "repl-write ack: no tid");
		as_fabric_msg_put(m);
		return;
	}

	// TODO - result_code is currently ignored! What should we do with it?
	// Note - CLUSTER_KEY_MISMATCH not special, can't re-queue transaction.
	uint32_t result_code;

	if (msg_get_uint32(m, RW_FIELD_RESULT, &result_code) != 0) {
		cf_warning(AS_RW, "repl-write ack: no result_code");
		as_fabric_msg_put(m);
		return;
	}

	rw_request_hkey hkey = { ns_id, *keyd };
	rw_request* rw = rw_request_hash_get(&hkey);

	if (! rw) {
		// Extra ack, after rw_request is already gone.
		as_fabric_msg_put(m);
		return;
	}

	pthread_mutex_lock(&rw->lock);

	if (rw->tid != tid) {
		// Extra ack - rw_request is that of a newer transaction for the same digest.
		pthread_mutex_unlock(&rw->lock);
		rw_request_release(rw);
		as_fabric_msg_put(m);
		return;
	}

	int i;

	for (i = 0; i < rw->n_dest_nodes; i++) {
		if (rw->dest_nodes[i] != node) {
			continue;
		}

		if (rw->dest_complete[i]) {
			// Extra ack for this replica write.
			pthread_mutex_unlock(&rw->lock);
			rw_request_release(rw);
			as_fabric_msg_put(m);
			return;
		}

		rw->dest_complete[i] = true;

		break;
	}

	if (i == rw->n_dest_nodes) {
		cf_warning(AS_RW, "repl-write ack: from non-dest node %lx", node);
		pthread_mutex_unlock(&rw->lock);
		rw_request_release(rw);
		as_fabric_msg_put(m);
		return;
	}

	for (int j = 0; j < rw->n_dest_nodes; j++) {
		if (! rw->dest_complete[j]) {
			// Still haven't heard from all replicas.
			pthread_mutex_unlock(&rw->lock);
			rw_request_release(rw);
			as_fabric_msg_put(m);
			return;
		}
	}

	if (! rw->from.any && rw->origin != FROM_NSUP &&
			! rw->respond_client_on_master_completion) {
		// Lost race against timeout in retransmit thread.
		pthread_mutex_unlock(&rw->lock);
		rw_request_release(rw);
		as_fabric_msg_put(m);
		return;
	}

	if (! rw->respond_client_on_master_completion) {
		rw->repl_write_cb(rw);
	}

	pthread_mutex_unlock(&rw->lock);

	rw_request_hash_delete(&hkey, rw);
	rw_request_release(rw);
	as_fabric_msg_put(m);
}