// For LDTs only:
//
// Populate *linfo from an incoming replication message.
//
// - The message must carry RW_FIELD_VINFOSET; if it does not, false is returned
//   and *linfo is left untouched.
// - replication_partition_version_match records whether the sender's version
//   info is the same as that of the reserved partition (rsv->p).
// - RW_FIELD_LDT_VERSION is optional: when present its value is stored in
//   ldt_source_version and ldt_source_version_set becomes true.
// - The prole-side version fields are reset here (0 / not set).
//
// Returns true once *linfo has been filled in.
bool
ldt_get_info(ldt_prole_info* linfo, msg* m, as_partition_reservation* rsv)
{
	as_partition_vinfo* sender_vinfo;
	size_t sender_vinfo_sz;

	bool has_vinfo = msg_get_buf(m, RW_FIELD_VINFOSET,
			(uint8_t**)&sender_vinfo, &sender_vinfo_sz, MSG_GET_DIRECT) == 0;

	if (! has_vinfo) {
		return false;
	}

	// Reset the prole-side version fields.
	linfo->ldt_prole_version_set = false;
	linfo->ldt_prole_version = 0;

	linfo->replication_partition_version_match =
			as_partition_vinfo_same(sender_vinfo, &rsv->p->version_info);

	// Source version defaults to 0 / not set; the flag reflects whether the
	// optional field was actually found in the message.
	linfo->ldt_source_version = 0;
	linfo->ldt_source_version_set =
			msg_get_uint64(m, RW_FIELD_LDT_VERSION,
					&linfo->ldt_source_version) == 0;

	return true;
}
// Example 2 (listing separator left over from a code-sample page; not part of the program)
// Incoming messages start here.
// - Could get a request that we need to service.
// - Could get a response to one of our requests - need to find the request and
//   send the real response to the remote end.
// - Could get a redirect, telling us to retry one of our requests on another
//   node (possibly ourselves).
//
// Parameters:
//   id    - node the fabric message arrived from.
//   m     - the fabric message. It is released here with as_fabric_msg_put()
//           on every path except: a refcount-0 message (returned untouched), an
//           accepted request (the message is handed on in tr.proxy_msg), and a
//           shipped-op response whose handler cleared free_msg.
//   udata - not used.
//
// Returns 0 in general; -1 if the message has refcount 0, or if a redirect
// names a transaction that is not in the proxy hash or whose stored request is
// missing its digest or proto.
int
proxy_msg_fn(cf_node id, msg *m, void *udata)
{
	int rv;

	if (cf_rc_count((void*)m) == 0) {
		cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naugty %p", m);
		return -1;
	}

	// 99999 is a sentinel - if the op field is missing we land in the default
	// case of the switch below.
	uint32_t op = 99999;
	msg_get_uint32(m, PROXY_FIELD_OP, &op);
	uint32_t transaction_id = 0;
	msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id);

	cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id);

	switch (op) {
		case PROXY_OP_REQUEST:
		{
			cf_atomic_int_incr(&g_config.proxy_action);

#ifdef DEBUG
			cf_debug(AS_PROXY, "Proxy_msg: received request");
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy msg");
#endif
#endif
			cf_digest *key;
			size_t sz = 0;
			if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
				cf_info(AS_PROXY, "proxy msg function: no digest, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			uint64_t cluster_key = 0;
			if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) {
				cf_info(AS_PROXY, "proxy msg function: no cluster key, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			// This is allowed to fail - this is a new field, and gets defaulted
			// to 0 if it doesn't exist.
			uint32_t timeout_ms = 0;
			msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms);

			// Fetch the proto LAST: it is the only field that is copied into
			// freshly allocated memory (MSG_GET_COPY_MALLOC). Doing it after
			// every other mandatory field has been validated means there is no
			// early return that could leak the copy.
			cl_msg *msgp;
			size_t as_msg_sz = 0;
			if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) {
				cf_info(AS_PROXY, "proxy msg function: no as msg, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			// Put the as_msg on the normal queue for processing.
			// INIT_TR
			as_transaction tr;
			as_transaction_init(&tr, key, msgp);
			tr.incoming_cluster_key = cluster_key;
			// A timeout of 0 means "no deadline"; otherwise scale the
			// millisecond timeout and offset it from the start time.
			tr.end_time             = (timeout_ms != 0) ? ((uint64_t)timeout_ms * 1000000) + tr.start_time : 0;
			tr.proxy_node           = id;
			tr.proxy_msg            = m;

			// Check here if this is shipped op.
			uint32_t info = 0;
			msg_get_uint32(m, PROXY_FIELD_INFO, &info);
			if (info & PROXY_INFO_SHIPPED_OP) {
				tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP;
				cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received");
			} else {
				cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid);
			}

			MICROBENCHMARK_RESET();

			// The message m travels with the transaction (tr.proxy_msg), so
			// it is deliberately not released here.
			thr_tsvc_enqueue(&tr);
		}
		break;

		case PROXY_OP_RESPONSE:
		{
#ifdef DEBUG
			// Got the response from the actual endpoint.
			cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id);
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy response");
#endif
#endif

			// Look up the element.
			proxy_request pr;
			bool free_msg = true;
			if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) {
				// Found the element (sometimes we get two acks so it's OK for
				// an ack to not find the transaction).

				if (pr.wr) {
					// Shipped-op response - the handler decides (via free_msg)
					// whether m is released below.
					as_proxy_shipop_response_hdlr(m, &pr, &free_msg);
				} else {
					as_proto *proto = NULL;
					size_t proto_sz = 0;
					bool has_proto = true;
					if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
						cf_info(AS_PROXY, "msg get buf failed!");
						// Without a proto there is nothing to forward. Make
						// sure nothing below touches an indeterminate
						// pointer/size; the originator gets an error instead.
						has_proto = false;
						proto = NULL;
						proto_sz = 0;
					}

#ifdef DEBUG_VERBOSE
					if (has_proto) {
						cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %zu %d", proto, proto_sz, pr.fd);
						for (size_t _i = 0; _i < proto_sz; _i++) {
							fprintf(stderr, " %x", ((byte *)proto)[_i]);
							if (_i % 16 == 15) {
								fprintf(stderr, "\n");
							}
						}
					}
#endif

#ifdef EXTRA_CHECKS
					if (has_proto) {
						as_proto proto_copy = *proto;
						as_proto_swap(&proto_copy);
						if (proto_copy.sz + 8 != proto_sz) {
							cf_info(AS_PROXY, "BONE BONE BONE!!!");
							cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %zu", (uint64_t) proto_copy.sz, proto_sz);
						}
					}
#endif

					if (pr.batch_shared) {
						// Batch sub-transaction: the result goes into the
						// shared batch response, not onto a socket.
						cf_digest* digest;
						size_t digest_sz = 0;

						if (! has_proto) {
							cf_warning(AS_PROXY, "Batch proxy response has no proto %u", transaction_id);
							as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN);
							as_proxy_set_stat_counters(-1);
						}
						else if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) {
							as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz);
							as_proxy_set_stat_counters(0);
						}
						else {
							cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id);
							as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN);
							as_proxy_set_stat_counters(-1);
						}
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);
					}
					else {
						// Write to the file descriptor. Only this branch uses
						// the client file handle, so the fd is logged and
						// checked here rather than before the batch test.
						cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd);
						cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0");

						size_t pos = 0;

						if (! has_proto) {
							// Nothing valid to send - treat like a failed
							// write so the client is not left waiting.
							shutdown(pr.fd_h->fd, SHUT_RDWR);
							as_proxy_set_stat_counters(-1);
							goto SendFin;
						}

						// Keep sending until the whole proto has gone out;
						// a would-block just yields and retries.
						while (pos < proto_sz) {
							rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);
							if (rv > 0) {
								pos += rv;
							}
							else if (rv < 0) {
								if (errno != EWOULDBLOCK) {
									// Common message when a client aborts.
									cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %zu pos %zu rv %d errno %d", pr.fd_h->fd, proto_sz, pos, rv, errno);
									shutdown(pr.fd_h->fd, SHUT_RDWR);
									as_proxy_set_stat_counters(-1);
									goto SendFin;
								}
								usleep(1); // yield
							}
							else {
								cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %zu pos %zu ", pr.fd_h->fd, proto_sz, pos);
								shutdown(pr.fd_h->fd, SHUT_RDWR);
								as_proxy_set_stat_counters(-1);
								goto SendFin;
							}
						}
						as_proxy_set_stat_counters(0);
SendFin:
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);

						// Return the fabric message or the direct file descriptor -
						// after write and complete.
						pr.fd_h->t_inprogress = false;
						AS_RELEASE_FILE_HANDLE(pr.fd_h);
						pr.fd_h = 0;
					}
					// Done with the original request message we had kept for
					// retransmission.
					as_fabric_msg_put(pr.fab_msg);
					pr.fab_msg = 0;
				}
			}
			else {
				cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id);
				as_proxy_set_stat_counters(-1);
			}

			if (free_msg) {
				as_fabric_msg_put(m);
			}
		}
		break;

		case PROXY_OP_REDIRECT:
		{
			// Sometimes the destination we proxied a request to isn't able to
			// satisfy it (for example, their copy of the partition in question
			// might be desync).
			cf_node new_dst = 0;
			msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst);
			cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst);

			// Look in the proxy retransmit hash for the tid. On success the
			// element's lock is held until the unlock at the end of this case
			// (or on the error returns below).
			proxy_request *pr;
			pthread_mutex_t *pr_lock;
			int r = 0;
			if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) {
				cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id);
				as_fabric_msg_put(m);
				return -1;
			}

			if (g_config.self_node == new_dst) {

				// Although we don't know we're the final destination, undo the
				// proxy-nature and put back on the main queue. Dangerous, as it
				// leaves open the possibility of a looping message.

				cf_digest *key;
				size_t sz = 0;
				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				// Copied last, so no later error path can leak the copy.
				cl_msg *msgp;
				sz = 0;
				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				// Put the as_msg on the normal queue for processing.
				// INIT_TR
				as_transaction tr;
				as_transaction_init(&tr, key, msgp);
				// Carry over the original timing and client/batch linkage
				// from the proxy request.
				tr.start_time = pr->start_time; // start time
				tr.end_time   = pr->end_time;
				tr.proto_fd_h = pr->fd_h;
				tr.batch_shared = pr->batch_shared;
				tr.batch_index = pr->batch_index;

				MICROBENCHMARK_RESET();

				thr_tsvc_enqueue(&tr);

				// The request is no longer proxied - drop it from the hash.
				as_fabric_msg_put(pr->fab_msg);
				shash_delete_lockfree(g_proxy_hash, &transaction_id);
			}
			else {
				// Change the destination, update the retransmit time.
				pr->dest = new_dst;
				pr->xmit_ms = cf_getms() + 1;

				// Send it. Take an extra reference for the send; if the send
				// fails, give that reference back.
				msg_incr_ref(pr->fab_msg);
				if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) {
					cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv);
					as_fabric_msg_put(pr->fab_msg);
				}
			}

			pthread_mutex_unlock(pr_lock);
		}
		as_fabric_msg_put(m);
		break;
		default:
			cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op);
			msg_dump(m, "proxy received unknown msg");
			as_fabric_msg_put(m);
			break;
	} // end switch

	return 0;
} // end proxy_msg_fn()
// For LDTs only:
//
// Apply one sub-operation of a multi-op replication message on this node.
//
// Parameters:
//   node  - node the message came from; passed through to delete_replica() /
//           write_replica().
//   m     - the sub-operation message. It is only read here, never released.
//   rsv   - partition reservation the sub-operation applies to.
//   linfo - LDT info; filled in by ldt_get_info() when the message's info bits
//           include RW_INFO_LDT, then passed on to the helpers below.
//
// Returns false ONLY when the message is flagged RW_INFO_LDT but its LDT info
// cannot be read - the one case meant to stop the caller's loop. Every other
// outcome returns true, including malformed messages (logged and skipped). The
// results of delete_replica() / write_replica() are not reported.
bool
handle_multiop_subop(cf_node node, msg* m, as_partition_reservation* rsv,
		ldt_prole_info* linfo)
{
	cf_digest* keyd;
	size_t sz;

	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "handle_multiop_subop: no digest");
		return true;
	}

	uint32_t info;

	if (msg_get_uint32(m, RW_FIELD_INFO, &info) != 0) {
		cf_warning(AS_RW, "handle_multiop_subop: no info");
		return true;
	}

	if ((info & RW_INFO_LDT) != 0 && ! ldt_get_info(linfo, m, rsv)) {
		cf_warning(AS_RW, "handle_multiop_subop: no ldt info");
		return false;
		// Will not continue! This is the only case that stops the loop.
	}

	if (! ldt_get_prole_version(rsv, keyd, linfo, info, NULL, false)) {
		// If parent cannot be due to incoming migration it's ok - continue and
		// allow subrecords to be replicated.
		return true;
	}

	// TODO - can we get here if ldt_enabled is false?
	if (rsv->ns->ldt_enabled) {
		ldt_set_prole_subrec_version(info, linfo, keyd);
	}

	cl_msg* msgp;
	size_t msgp_sz;

	uint8_t* pickled_buf;
	size_t pickled_sz;

	// An AS_MSG field means "delete"; otherwise a RECORD field carries the
	// pickled record to write.
	if (msg_get_buf(m, RW_FIELD_AS_MSG, (uint8_t**)&msgp, &msgp_sz,
			MSG_GET_DIRECT) == 0) {
		delete_replica(rsv, keyd,
				(info & (RW_INFO_LDT_SUBREC | RW_INFO_LDT_ESR)) != 0,
				(info & RW_INFO_NSUP_DELETE) != 0,
				as_msg_is_xdr(&msgp->msg),
				node);
	}
	else if (msg_get_buf(m, RW_FIELD_RECORD, (uint8_t**)&pickled_buf,
			&pickled_sz, MSG_GET_DIRECT) == 0) {
		as_generation generation;

		if (msg_get_uint32(m, RW_FIELD_GENERATION, &generation) != 0) {
			cf_warning(AS_RW, "handle_multiop_subop: no generation");
			return true;
		}

		uint32_t void_time;

		if (msg_get_uint32(m, RW_FIELD_VOID_TIME, &void_time) != 0) {
			cf_warning(AS_RW, "handle_multiop_subop: no void-time");
			return true;
		}

		uint64_t last_update_time = 0;
		// Optional - older versions won't send it.
		msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, &last_update_time);

		// Record properties are optional too. Start from an empty, fully
		// initialized value so that write_replica() never receives an
		// indeterminate data pointer when the field is absent.
		as_rec_props rec_props;
		size_t rec_props_size = 0;

		rec_props.p_data = NULL;

		msg_get_buf(m, RW_FIELD_REC_PROPS, &rec_props.p_data, &rec_props_size,
				MSG_GET_DIRECT);
		rec_props.size = (uint32_t)rec_props_size;

		write_replica(rsv, keyd, pickled_buf, pickled_sz, &rec_props,
				generation, void_time, last_update_time, node, info, linfo);
	}
	else {
		cf_warning(AS_RW, "handle_multiop_subop: no msg or pickle");
	}

	return true;
}
// Handle an incoming replica-write message from another node.
//
// Parameters:
//   node - node the message came from; the ack is sent back to it.
//   m    - the replica-write message. Every path through this function ends by
//          passing it to send_repl_write_ack() together with a result code.
//
// The message either carries an AS_MSG field (handled as a delete) or a RECORD
// field (a pickled record to write). Malformed messages are acked with
// AS_PROTO_RESULT_FAIL_UNKNOWN; a partition that is absent on this node is
// acked with AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH.
void
repl_write_handle_op(cf_node node, msg* m)
{
	uint8_t* ns_name;
	size_t ns_name_len;

	if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "repl_write_handle_op: no namespace");
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len);

	if (! ns) {
		cf_warning(AS_RW, "repl_write_handle_op: invalid namespace");
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	cf_digest* keyd;
	size_t sz;

	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &sz,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "repl_write_handle_op: no digest");
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	as_partition_reservation rsv;

	// From here on, every exit must release the reservation.
	as_partition_reserve_migrate(ns, as_partition_getid(*keyd), &rsv, NULL);

	if (rsv.state == AS_PARTITION_STATE_ABSENT) {
		as_partition_release(&rsv);
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH);
		return;
	}

	// The info field is optional - absent means no flags.
	uint32_t info = 0;

	msg_get_uint32(m, RW_FIELD_INFO, &info);

	// Zero-initialized because ldt_get_info() only fills it in for messages
	// flagged RW_INFO_LDT, yet it is handed to ldt_get_prole_version() and
	// write_replica() regardless.
	// NOTE(review): whether those callees read linfo for non-LDT messages is
	// not visible here - this is defensive.
	ldt_prole_info linfo = { 0 };

	if ((info & RW_INFO_LDT) != 0 && ! ldt_get_info(&linfo, m, &rsv)) {
		cf_warning(AS_RW, "repl_write_handle_op: bad ldt info");
		as_partition_release(&rsv);
		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
		return;
	}

	cl_msg* msgp;
	size_t msgp_sz;

	uint8_t* pickled_buf;
	size_t pickled_sz;

	uint32_t result;

	if (msg_get_buf(m, RW_FIELD_AS_MSG, (uint8_t**)&msgp, &msgp_sz,
			MSG_GET_DIRECT) == 0) {
		// <><><><><><>  Delete Operation  <><><><><><>

		// TODO - does this really need to be here? Just to fill linfo?
		if (! ldt_get_prole_version(&rsv, keyd, &linfo, info, NULL, false)) {
			as_partition_release(&rsv);
			send_repl_write_ack(node, m, AS_PROTO_RESULT_OK); // ???
			return;
		}

		result = delete_replica(&rsv, keyd,
				(info & (RW_INFO_LDT_SUBREC | RW_INFO_LDT_ESR)) != 0,
				(info & RW_INFO_NSUP_DELETE) != 0,
				as_msg_is_xdr(&msgp->msg),
				node);
	}
	else if (msg_get_buf(m, RW_FIELD_RECORD, (uint8_t**)&pickled_buf,
			&pickled_sz, MSG_GET_DIRECT) == 0) {
		// <><><><><><>  Write Pickle  <><><><><><>

		as_generation generation;

		if (msg_get_uint32(m, RW_FIELD_GENERATION, &generation) != 0) {
			cf_warning(AS_RW, "repl_write_handle_op: no generation");
			as_partition_release(&rsv);
			send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return;
		}

		uint32_t void_time;

		if (msg_get_uint32(m, RW_FIELD_VOID_TIME, &void_time) != 0) {
			cf_warning(AS_RW, "repl_write_handle_op: no void-time");
			as_partition_release(&rsv);
			send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return;
		}

		uint64_t last_update_time = 0;

		// Optional - older versions won't send it.
		msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, &last_update_time);

		// Record properties are optional too. Start from an empty, fully
		// initialized value so that write_replica() never receives an
		// indeterminate data pointer when the field is absent.
		as_rec_props rec_props;
		size_t rec_props_size = 0;

		rec_props.p_data = NULL;

		msg_get_buf(m, RW_FIELD_REC_PROPS, &rec_props.p_data, &rec_props_size,
				MSG_GET_DIRECT);
		rec_props.size = (uint32_t)rec_props_size;

		result = write_replica(&rsv, keyd, pickled_buf, pickled_sz, &rec_props,
				generation, void_time, last_update_time, node, info, &linfo);
	}
	else {
		cf_warning(AS_RW, "repl_write_handle_op: no msg or pickle");
		result = AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	as_partition_release(&rsv);
	send_repl_write_ack(node, m, result);
}