Exemplo n.º 1
0
// Incoming messages start here.
// - Could get a request that we need to service.
// - Could get a response to one of our requests - need to find the request and
//   send the real response to the remote end.
int
proxy_msg_fn(cf_node id, msg *m, void *udata)
{
	int rv;

	if (cf_rc_count((void*)m) == 0) {
		cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naugty %p", m);
		return -1;
	}

	uint32_t op = 99999;
	msg_get_uint32(m, PROXY_FIELD_OP, &op);
	uint32_t transaction_id = 0;
	msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id);

	cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id);

	switch (op) {
		case PROXY_OP_REQUEST:
		{
			cf_atomic_int_incr(&g_config.proxy_action);

#ifdef DEBUG
			cf_debug(AS_PROXY, "Proxy_msg: received request");
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy msg");
#endif
#endif
			cf_digest *key;
			size_t sz = 0;
			if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
				cf_info(AS_PROXY, "proxy msg function: no digest, problem");
				as_fabric_msg_put(m);
				return 0;
			}
			cl_msg *msgp;
			size_t as_msg_sz = 0;
			if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) {
				cf_info(AS_PROXY, "proxy msg function: no as msg, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			uint64_t cluster_key = 0;
			if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) {
				cf_info(AS_PROXY, "proxy msg function: no cluster key, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			// This is allowed to fail - this is a new field, and gets defaulted
			// to 0 if it doesn't exist.
			uint32_t timeout_ms = 0;
			msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms);
//			cf_info(AS_PROXY, "proxy msg: received timeout_ms of %d",timeout_ms);

			// Put the as_msg on the normal queue for processing.
			// INIT_TR
			as_transaction tr;
			as_transaction_init(&tr, key, msgp);
			tr.incoming_cluster_key = cluster_key;
			tr.end_time             = (timeout_ms != 0) ? ((uint64_t)timeout_ms * 1000000) + tr.start_time : 0;
			tr.proxy_node           = id;
			tr.proxy_msg            = m;

			// Check here if this is shipped op.
			uint32_t info = 0;
			msg_get_uint32(m, PROXY_FIELD_INFO, &info);
			if (info & PROXY_INFO_SHIPPED_OP) {
				tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP;
				cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received");
			} else {
				cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid);
			}

			MICROBENCHMARK_RESET();

			thr_tsvc_enqueue(&tr);
		}
		break;

		case PROXY_OP_RESPONSE:
		{
#ifdef DEBUG
			// Got the response from the actual endpoint.
			cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id);
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy response");
#endif
#endif

			// Look up the element.
			proxy_request pr;
			bool free_msg = true;
			if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) {
				// Found the element (sometimes we get two acks so it's OK for
				// an ack to not find the transaction).

				if (pr.wr) {
					as_proxy_shipop_response_hdlr(m, &pr, &free_msg);
				} else {
					as_proto *proto;
					size_t proto_sz;
					if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
						cf_info(AS_PROXY, "msg get buf failed!");
					}

#ifdef DEBUG_VERBOSE
					cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %"PRIu64" %d", proto, proto_sz, pr.fd);
					for (size_t _i = 0; _i < proto_sz; _i++) {
						fprintf(stderr, " %x", ((byte *)proto)[_i]);
						if (_i % 16 == 15) {
							fprintf(stderr, "\n");
						}
					}
#endif

#ifdef EXTRA_CHECKS
					as_proto proto_copy = *proto;
					as_proto_swap(&proto_copy);
					if (proto_copy.sz + 8 != proto_sz) {
						cf_info(AS_PROXY, "BONE BONE BONE!!!");
						cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %u", (uint64_t) proto_copy.sz, proto_sz);
					}
#endif

					// Write to the file descriptor.
					cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd);
					cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0");

					if (pr.batch_shared) {
						cf_digest* digest;
						size_t digest_sz = 0;

						if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) {
							as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz);
							as_proxy_set_stat_counters(0);
						}
						else {
							cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id);
							as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN);
							as_proxy_set_stat_counters(-1);
						}
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);
					}
					else {
						size_t pos = 0;
						while (pos < proto_sz) {
							rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);
							if (rv > 0) {
								pos += rv;
							}
							else if (rv < 0) {
								if (errno != EWOULDBLOCK) {
									// Common message when a client aborts.
									cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %d pos %d rv %d errno %d", pr.fd_h->fd, proto_sz, pos, rv, errno);
									shutdown(pr.fd_h->fd, SHUT_RDWR);
									as_proxy_set_stat_counters(-1);
									goto SendFin;
								}
								usleep(1); // yield
							}
							else {
								cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", pr.fd_h->fd, proto_sz, pos);
								shutdown(pr.fd_h->fd, SHUT_RDWR);
								as_proxy_set_stat_counters(-1);
								goto SendFin;
							}
						}
						as_proxy_set_stat_counters(0);
SendFin:
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);

						// Return the fabric message or the direct file descriptor -
						// after write and complete.
						pr.fd_h->t_inprogress = false;
						AS_RELEASE_FILE_HANDLE(pr.fd_h);
						pr.fd_h = 0;
					}
					as_fabric_msg_put(pr.fab_msg);
					pr.fab_msg = 0;
				}
			}
			else {
				cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id);
				as_proxy_set_stat_counters(-1);
			}

			if (free_msg) {
				as_fabric_msg_put(m);
			}
		}
		break;

		case PROXY_OP_REDIRECT:
		{
			// Sometimes the destination we proxied a request to isn't able to
			// satisfy it (for example, their copy of the partition in question
			// might be desync).
			cf_node new_dst = 0;
			msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst);
			cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst);

			// Look in the proxy retransmit hash for the tid.
			proxy_request *pr;
			pthread_mutex_t *pr_lock;
			int r = 0;
			if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) {
				cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id);
				as_fabric_msg_put(m);
				return -1;
			}

			if (g_config.self_node == new_dst) {

				// Although we don't know we're the final destination, undo the
				// proxy-nature and put back on the main queue. Dangerous, as it
				// leaves open the possibility of a looping message.

				cf_digest *key;
				size_t sz = 0;
				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				cl_msg *msgp;
				sz = 0;
				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				// Put the as_msg on the normal queue for processing.
				// INIT_TR
				as_transaction tr;
				as_transaction_init(&tr, key, msgp);
				tr.start_time = pr->start_time; // start time
				tr.end_time   = pr->end_time;
				tr.proto_fd_h = pr->fd_h;
				tr.batch_shared = pr->batch_shared;
				tr.batch_index = pr->batch_index;

				MICROBENCHMARK_RESET();

				thr_tsvc_enqueue(&tr);

				as_fabric_msg_put(pr->fab_msg);
				shash_delete_lockfree(g_proxy_hash, &transaction_id);
			}
			else {
				// Change the destination, update the retransmit time.
				pr->dest = new_dst;
				pr->xmit_ms = cf_getms() + 1;

				// Send it.
				msg_incr_ref(pr->fab_msg);
				if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) {
					cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv);
					as_fabric_msg_put(pr->fab_msg);
				}
			}

			pthread_mutex_unlock(pr_lock);
		}
		as_fabric_msg_put(m);
		break;
		default:
			cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op);
			msg_dump(m, "proxy received unknown msg");
			as_fabric_msg_put(m);
			break;
	} // end switch

	return 0;
} // end proxy_msg_fn()
Exemplo n.º 2
0
// Set of threads which talk to client over the connection for doing the needful
// processing. Note that once fd is assigned to a thread all the work on that fd
// is done by that thread. Fair fd usage is expected of the client. First thread
// is special - also does accept [listens for new connections]. It is the only
// thread which does it.
void *
thr_demarshal(void *arg)
{
	cf_socket_cfg *s, *ls;
	// Create my epoll fd, register in the global list.
	struct epoll_event ev;
	int nevents, i, n, epoll_fd;
	cf_clock last_fd_print = 0;

#if defined(USE_SYSTEMTAP)
	uint64_t nodeid = g_config.self_node;
#endif

	// Early stage aborts; these will cause faults in process scope.
	cf_assert(arg, AS_DEMARSHAL, CF_CRITICAL, "invalid argument");
	s = &g_config.socket;
	ls = &g_config.localhost_socket;

#ifdef USE_JEM
	int orig_arena;
	if (0 > (orig_arena = jem_get_arena())) {
		cf_crash(AS_DEMARSHAL, "Failed to get original arena for thr_demarshal()!");
	} else {
		cf_info(AS_DEMARSHAL, "Saved original JEMalloc arena #%d for thr_demarshal()", orig_arena);
	}
#endif

	// Figure out my thread index.
	pthread_t self = pthread_self();
	int thr_id;
	for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) {
		if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self))
			break;
	}

	if (thr_id == MAX_DEMARSHAL_THREADS) {
		cf_debug(AS_FABRIC, "Demarshal thread could not figure own ID, bogus, exit, fu!");
		return(0);
	}

	// First thread accepts new connection at interface socket.
	if (thr_id == 0) {
		demarshal_file_handle_init();
		epoll_fd = epoll_create(EPOLL_SZ);
		if (epoll_fd == -1)
			cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno));

		memset(&ev, 0, sizeof (ev));
		ev.events = EPOLLIN | EPOLLERR | EPOLLHUP;
		ev.data.fd = s->sock;
		if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &ev))
			cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno));
		cf_info(AS_DEMARSHAL, "Service started: socket %s:%d", s->addr, s->port);

		if (ls->sock) {
			ev.events = EPOLLIN | EPOLLERR | EPOLLHUP;
			ev.data.fd = ls->sock;
			if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ls->sock, &ev))
			  cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno));
			cf_info(AS_DEMARSHAL, "Service also listening on localhost socket %s:%d", ls->addr, ls->port);
		}
	}
	else {
		epoll_fd = epoll_create(EPOLL_SZ);
		if (epoll_fd == -1)
			cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno));
	}

	g_demarshal_args->epoll_fd[thr_id] = epoll_fd;
	cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id);

	int id_cntr = 0;

	// Demarshal transactions from the socket.
	for ( ; ; ) {
		struct epoll_event events[EPOLL_SZ];

		cf_detail(AS_DEMARSHAL, "calling epoll");

		nevents = epoll_wait(epoll_fd, events, EPOLL_SZ, -1);

		if (0 > nevents) {
			cf_debug(AS_DEMARSHAL, "epoll_wait() returned %d ; errno = %d (%s)", nevents, errno, cf_strerror(errno));
		}

		cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents);

		uint64_t now_ns = cf_getns();
		uint64_t now_ms = now_ns / 1000000;

		// Iterate over all events.
		for (i = 0; i < nevents; i++) {
			if ((s->sock == events[i].data.fd) || (ls->sock == events[i].data.fd)) {
				// Accept new connections on the service socket.
				int csocket = -1;
				struct sockaddr_in caddr;
				socklen_t clen = sizeof(caddr);
				char cpaddr[64];

				if (-1 == (csocket = accept(events[i].data.fd, (struct sockaddr *)&caddr, &clen))) {
					// This means we're out of file descriptors - could be a SYN
					// flood attack or misbehaving client. Eventually we'd like
					// to make the reaper fairer, but for now we'll just have to
					// ignore the accept error and move on.
					if ((errno == EMFILE) || (errno == ENFILE)) {
						if (last_fd_print != (cf_getms() / 1000L)) {
							cf_info(AS_DEMARSHAL, " warning: hit OS file descript limit (EMFILE on accept), consider raising limit");
							last_fd_print = cf_getms() / 1000L;
						}
						continue;
					}
					cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno);
				}

				// Get the client IP address in string form.
				if (caddr.sin_family == AF_INET) {
					if (NULL == inet_ntop(AF_INET, &caddr.sin_addr.s_addr, (char *)cpaddr, sizeof(cpaddr))) {
						cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno);
					}
				}
				else if (caddr.sin_family == AF_INET6) {
					struct sockaddr_in6* addr_in6 = (struct sockaddr_in6*)&caddr;

					if (NULL == inet_ntop(AF_INET6, &addr_in6->sin6_addr, (char *)cpaddr, sizeof(cpaddr))) {
						cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno);
					}
				}
				else {
					cf_crash(AS_DEMARSHAL, "unknown address family %u", caddr.sin_family);
				}

				cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", cpaddr, csocket);

				// Validate the limit of protocol connections we allow.
				uint32_t conns_open = g_config.proto_connections_opened - g_config.proto_connections_closed;
				if (conns_open > g_config.n_proto_fd_max) {
					if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs
						cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open);
						last_fd_print = cf_getms();
					}
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					continue;
				}

				// Set the socket to nonblocking.
				if (-1 == cf_socket_set_nonblocking(csocket)) {
					cf_info(AS_DEMARSHAL, "unable to set client socket to nonblocking mode");
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					continue;
				}

				// Create as_file_handle and queue it up in epoll_fd for further
				// communication on one of the demarshal threads.
				as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle));
				if (!fd_h) {
					cf_crash(AS_DEMARSHAL, "malloc");
				}

				sprintf(fd_h->client, "%s:%d", cpaddr, ntohs(caddr.sin_port));
				fd_h->fd = csocket;

				fd_h->last_used = cf_getms();
				fd_h->reap_me = false;
				fd_h->trans_active = false;
				fd_h->proto = 0;
				fd_h->proto_unread = 0;
				fd_h->fh_info = 0;
				fd_h->security_filter = as_security_filter_create();

				// Insert into the global table so the reaper can manage it. Do
				// this before queueing it up for demarshal threads - once
				// EPOLL_CTL_ADD is done it's difficult to back out (if insert
				// into global table fails) because fd state could be anything.
				cf_rc_reserve(fd_h);

				pthread_mutex_lock(&g_file_handle_a_LOCK);

				int j;
				bool inserted = true;

				if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) {
					inserted = false;
				}
				else {
					g_file_handle_a[j] = fd_h;
				}

				pthread_mutex_unlock(&g_file_handle_a_LOCK);

				if (!inserted) {
					cf_info(AS_DEMARSHAL, "unable to add socket to file handle table");
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					cf_rc_free(fd_h); // will free even with ref-count of 2
				}
				else {
					// Place the client socket in the event queue.
					memset(&ev, 0, sizeof(ev));
					ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP ;
					ev.data.ptr = fd_h;

					// Round-robin pick up demarshal thread epoll_fd and add
					// this new connection to epoll.
					int id;
					while (true) {
						id = (id_cntr++) % g_demarshal_args->num_threads;
						if (g_demarshal_args->epoll_fd[id] != 0) {
							break;
						}
					}

					fd_h->epoll_fd = g_demarshal_args->epoll_fd[id];

					if (0 > (n = epoll_ctl(fd_h->epoll_fd, EPOLL_CTL_ADD, csocket, &ev))) {
						cf_info(AS_DEMARSHAL, "unable to add socket to event queue of demarshal thread %d %d", id, g_demarshal_args->num_threads);
						pthread_mutex_lock(&g_file_handle_a_LOCK);
						fd_h->reap_me = true;
						as_release_file_handle(fd_h);
						fd_h = 0;
						pthread_mutex_unlock(&g_file_handle_a_LOCK);
					}
					else {
						cf_atomic_int_incr(&g_config.proto_connections_opened);
					}
				}
			}
			else {
				bool has_extra_ref   = false;
				as_file_handle *fd_h = events[i].data.ptr;
				if (fd_h == 0) {
					cf_info(AS_DEMARSHAL, "event with null handle, continuing");
					goto NextEvent;
				}

				cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", fd_h->fd, events[i].events);

				// Process data on an existing connection: this might be more
				// activity on an already existing transaction, so we have some
				// state to manage.
				as_proto *proto_p = 0;
				int fd = fd_h->fd;

				if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) {
					cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", fd, events[i].events);
					// no longer in use: out of epoll etc
					goto NextEvent_FD_Cleanup;
				}

				if (fd_h->trans_active) {
					goto NextEvent;
				}

				// If pointer is NULL, then we need to create a transaction and
				// store it in the buffer.
				if (fd_h->proto == NULL) {
					as_proto proto;
					int sz;

					/* Get the number of available bytes */
					if (-1 == ioctl(fd, FIONREAD, &sz)) {
						cf_info(AS_DEMARSHAL, "unable to get number of available bytes");
						goto NextEvent_FD_Cleanup;
					}

					// If we don't have enough data to fill the message buffer,
					// just wait and we'll come back to this one. However, we'll
					// let messages with zero size through, since they are
					// likely errors. We don't cleanup the FD in this case since
					// we'll get more data on it.
					if (sz < sizeof(as_proto) && sz != 0) {
						goto NextEvent;
					}

					// Do a preliminary read of the header into a stack-
					// allocated structure, so that later on we can allocate the
					// entire message buffer.
					if (0 >= (n = cf_socket_recv(fd, &proto, sizeof(as_proto), MSG_WAITALL))) {
						cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d sz was %d errno %d", n, sz, errno);
						goto NextEvent_FD_Cleanup;
					}

					if (proto.version != PROTO_VERSION &&
							// For backward compatibility, allow version 0 with
							// security messages.
							! (proto.version == 0 && proto.type == PROTO_TYPE_SECURITY)) {
						cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u",
								fd_h->client, proto.version);
						goto NextEvent_FD_Cleanup;
					}

					// Swap the necessary elements of the as_proto.
					as_proto_swap(&proto);

					if (proto.sz > PROTO_SIZE_MAX) {
						cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %"PRIu64,
								fd_h->client, PROTO_SIZE_MAX, proto.sz);
						goto NextEvent_FD_Cleanup;
					}

#ifdef USE_JEM
					// Attempt to peek the namespace and set the JEMalloc arena accordingly.
					size_t peeked_data_sz = 0;
					size_t min_field_sz = sizeof(uint32_t) + sizeof(char);
					size_t min_as_msg_sz = sizeof(as_msg) + min_field_sz;
					size_t peekbuf_sz = 2048; // (Arbitrary "large enough" size for peeking the fields of "most" AS_MSGs.)
					uint8_t peekbuf[peekbuf_sz];
					if (PROTO_TYPE_AS_MSG == proto.type) {
						size_t offset = sizeof(as_msg);
						// Number of bytes to peek from the socket.
//						size_t peek_sz = peekbuf_sz;                 // Peak up to the size of the peek buffer.
						size_t peek_sz = MIN(proto.sz, peekbuf_sz);  // Peek only up to the minimum necessary number of bytes.
						if (!(peeked_data_sz = cf_socket_recv(fd, peekbuf, peek_sz, 0))) {
							// That's actually legitimate. The as_proto may have gone into one
							// packet, the as_msg into the next one, which we haven't yet received.
							// This just "never happened" without async.
							cf_detail(AS_DEMARSHAL, "could not peek the as_msg header, expected %zu byte(s)", peek_sz);
						}
						if (peeked_data_sz > min_as_msg_sz) {
//							cf_debug(AS_DEMARSHAL, "(Peeked %zu bytes.)", peeked_data_sz);
							if (peeked_data_sz > proto.sz) {
								cf_warning(AS_DEMARSHAL, "Received unexpected extra data from client %s socket %d when peeking as_proto!", fd_h->client, fd);
								log_as_proto_and_peeked_data(&proto, peekbuf, peeked_data_sz);
								goto NextEvent_FD_Cleanup;
							}

							if (((as_msg*)peekbuf)->info1 & AS_MSG_INFO1_BATCH) {
								jem_set_arena(orig_arena);
							} else {
								uint16_t n_fields = ntohs(((as_msg *) peekbuf)->n_fields), field_num = 0;
								bool found = false;
	//							cf_debug(AS_DEMARSHAL, "Found %d AS_MSG fields", n_fields);
								while (!found && (field_num < n_fields)) {
									as_msg_field *field = (as_msg_field *) (&peekbuf[offset]);
									uint32_t value_sz = ntohl(field->field_sz) - 1;
	//								cf_debug(AS_DEMARSHAL, "Field #%d offset: %lu", field_num, offset);
	//								cf_debug(AS_DEMARSHAL, "\tvalue_sz %u", value_sz);
	//								cf_debug(AS_DEMARSHAL, "\ttype %d", field->type);
									if (AS_MSG_FIELD_TYPE_NAMESPACE == field->type) {
										if (value_sz >= AS_ID_NAMESPACE_SZ) {
											cf_warning(AS_DEMARSHAL, "namespace too long (%u) in as_msg", value_sz);
											goto NextEvent_FD_Cleanup;
										}
										char ns[AS_ID_NAMESPACE_SZ];
										found = true;
										memcpy(ns, field->data, value_sz);
										ns[value_sz] = '\0';
	//									cf_debug(AS_DEMARSHAL, "Found ns \"%s\" in field #%d.", ns, field_num);
										jem_set_arena(as_namespace_get_jem_arena(ns));
									} else {
	//									cf_debug(AS_DEMARSHAL, "Message field %d is not namespace (type %d) ~~ Reading next field", field_num, field->type);
										field_num++;
										offset += sizeof(as_msg_field) + value_sz;
										if (offset >= peeked_data_sz) {
											break;
										}
									}
								}
								if (!found) {
									cf_warning(AS_DEMARSHAL, "Can't get namespace from AS_MSG (peeked %zu bytes) ~~ Using default thr_demarshal arena.", peeked_data_sz);
									jem_set_arena(orig_arena);
								}
							}
						} else {
							jem_set_arena(orig_arena);
						}
					} else {
						jem_set_arena(orig_arena);
					}
#endif

					// Allocate the complete message buffer.
					proto_p = cf_malloc(sizeof(as_proto) + proto.sz);

					cf_assert(proto_p, AS_DEMARSHAL, CF_CRITICAL, "allocation: %zu %s", (sizeof(as_proto) + proto.sz), cf_strerror(errno));
					memcpy(proto_p, &proto, sizeof(as_proto));

#ifdef USE_JEM
					// Jam in the peeked data.
					if (peeked_data_sz) {
						memcpy(proto_p->data, &peekbuf, peeked_data_sz);
					}
					fd_h->proto_unread = proto_p->sz - peeked_data_sz;
#else
					fd_h->proto_unread = proto_p->sz;
#endif
					fd_h->proto = (void *) proto_p;
				}
				else {
					proto_p = fd_h->proto;
				}

				if (fd_h->proto_unread > 0) {

					// Read the data.
					n = cf_socket_recv(fd, proto_p->data + (proto_p->sz - fd_h->proto_unread), fd_h->proto_unread, 0);
					if (0 >= n) {
						if (errno == EAGAIN) {
							continue;
						}
						cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", n, errno, cf_strerror(errno));
						goto NextEvent_FD_Cleanup;
					}

					// Decrement bytes-unread counter.
					cf_detail(AS_DEMARSHAL, "read fd %d (%d %d)", fd, n, fd_h->proto_unread);
					fd_h->proto_unread -= n;
				}

				// Check for a finished read.
				if (0 == fd_h->proto_unread) {

					// It's only really live if it's injecting a transaction.
					fd_h->last_used = now_ms;

					thr_demarshal_pause(fd_h); // pause reading while the transaction is in progress
					fd_h->proto = 0;
					fd_h->proto_unread = 0;

					// INIT_TR
					as_transaction tr;
					as_transaction_init(&tr, NULL, (cl_msg *)proto_p);

					cf_rc_reserve(fd_h);
					has_extra_ref   = true;
					tr.proto_fd_h   = fd_h;
					tr.start_time   = now_ns; // set transaction start time
					tr.preprocessed = false;

					if (! as_proto_is_valid_type(proto_p)) {
						cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type);
						// We got a proto message type we don't recognize, so it
						// may not do any good to send back an as_msg error, but
						// it's the best we can do. At least we can keep the fd.
						as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					if (g_config.microbenchmarks) {
						histogram_insert_data_point(g_config.demarshal_hist, now_ns);
						tr.microbenchmark_time = cf_getns();
					}

					// Check if it's compressed.
					if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) {
						// Decompress it - allocate buffer to hold decompressed
						// packet.
						uint8_t *decompressed_buf = NULL;
						size_t decompressed_buf_size = 0;
						int rv = 0;
						if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) {
							cf_warning(AS_DEMARSHAL, "as_proto decompression failed! (rv %d)", rv);
							cf_warning_binary(AS_DEMARSHAL, proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p");
							as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
							cf_atomic_int_incr(&g_config.proto_transactions);
							goto NextEvent;
						}
						// Count the packets.
						cf_atomic_int_add(&g_config.stat_compressed_pkts_received, 1);
						// Free the compressed packet since we'll be using the
						// decompressed packet from now on.
						cf_free(proto_p);
						proto_p = NULL;
						// Get original packet.
						tr.msgp = (cl_msg *)decompressed_buf;
						as_proto_swap(&(tr.msgp->proto));

						if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) {
							cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]",
									tr.msgp->proto.version, tr.msgp->proto.type, tr.msgp->proto.sz, decompressed_buf_size);
							as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
							cf_atomic_int_incr(&g_config.proto_transactions);
							goto NextEvent;
						}
					}

					// Security protocol transactions.
					if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) {
						as_security_transact(&tr);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					// Info protocol requests.
					if (tr.msgp->proto.type == PROTO_TYPE_INFO) {
						if (as_info(&tr)) {
							cf_warning(AS_DEMARSHAL, "Info request failed to be enqueued ~~ Freeing protocol buffer");
							goto NextEvent_FD_Cleanup;
						}
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp);

					// Fast path for batch requests.
					if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) {
						as_batch_queue_task(&tr);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					// Either process the transaction directly in this thread,
					// or queue it for processing by another thread (tsvc/info).
					if (0 != thr_tsvc_process_or_enqueue(&tr)) {
						cf_warning(AS_DEMARSHAL, "Failed to queue transaction to the service thread");
						goto NextEvent_FD_Cleanup;
					}
					else {
						cf_atomic_int_incr(&g_config.proto_transactions);
					}
				}

				// Jump the proto message free & FD cleanup. If we get here, the
				// above operations went smoothly. The message free & FD cleanup
				// job is handled elsewhere as directed by
				// thr_tsvc_process_or_enqueue().
				goto NextEvent;

NextEvent_FD_Cleanup:
				// If we allocated memory for the incoming message, free it.
				if (proto_p) {
					cf_free(proto_p);
					fd_h->proto = 0;
				}
				// If fd has extra reference for transaction, release it.
				if (has_extra_ref) {
					cf_rc_release(fd_h);
				}
				// Remove the fd from the events list.
				if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, 0) < 0) {
					cf_crash(AS_DEMARSHAL, "unable to remove socket FD %d from epoll instance FD %d: %d (%s)",
							fd, epoll_fd, errno, cf_strerror(errno));
				}
				pthread_mutex_lock(&g_file_handle_a_LOCK);
				fd_h->reap_me = true;
				as_release_file_handle(fd_h);
				fd_h = 0;
				pthread_mutex_unlock(&g_file_handle_a_LOCK);
NextEvent:
				;
			}

			// We should never be canceled externally, but just in case...
			pthread_testcancel();
		}
	}

	return NULL;
}