Example #1
/* gets information back from any of the nodes in the cluster */
int
citrusleaf_info_cluster(as_cluster *cluster, char *names, char **values_r, bool send_asis, bool check_bounds, int timeout_ms)
{
	if (timeout_ms == 0) {
		timeout_ms = 100; // milliseconds
	}
	
	uint64_t start = cf_getms();
	uint64_t end = start + timeout_ms;
	int ret = -1;

	as_nodes* nodes = as_nodes_reserve(cluster);
	
	for (uint32_t i = 0; i < nodes->size; i++) {
		as_node* node = nodes->array[i];
		struct sockaddr_in* sa_in = as_node_get_address(node);
		char* values = 0;
		
		if (citrusleaf_info_host_auth(cluster, sa_in, names, &values, (int)(end - cf_getms()), send_asis, check_bounds) == 0) {
			*values_r = values;
			ret = 0;
			break;
		}
		
		if (cf_getms() >= end) {
			ret = -2;
			break;
		}
	}
	as_nodes_release(nodes);
	return ret;
}
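
A minimal usage sketch (not from the original source; "statistics" is just an illustrative info command). Per the usage notes in Example #7 below, the returned buffer is allocated by the C client and must be freed by the caller:

char names[] = "statistics"; // hypothetical info command
char* values = NULL;

if (citrusleaf_info_cluster(cluster, names, &values, false, false, 1000) == 0) {
	printf("%s\n", values);
	free(values); // allocated by the client library for the caller
}
// Non-zero return: -1 means no node answered, -2 means the deadline passed.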
Example #2
bool
udf_timer_timedout(const as_timer *as_tt)
{
	time_tracker *tt = (time_tracker *)pthread_getspecific(timer_tlskey);
	if (!tt || !tt->end_time || !tt->udata) {
		return true;
	}
	bool timedout = (cf_getms() > tt->end_time(tt));
	if (timedout) {
		cf_debug(AS_UDF, "UDF Timed Out [%ld:%ld]", cf_getms(), tt->end_time(tt));
		return true;
	}
	return false;
}
Example #3
as_status
as_event_command_execute(as_event_command* cmd, as_error* err)
{
    ck_pr_inc_32(&cmd->cluster->async_pending);

    // Only do this after the above increment to avoid a race with as_cluster_destroy().
    if (!cmd->cluster->valid) {
        as_event_command_free(cmd);
        return as_error_set_message(err, AEROSPIKE_ERR_CLIENT, "Client shutting down");
    }

    // Use pointer comparison for performance.
    // If portability becomes an issue, use "pthread_equal(event_loop->thread, pthread_self())"
    // instead.
    if (cmd->event_loop->thread == pthread_self()) {
        // We are already in event loop thread, so start processing.
        as_event_command_begin(cmd);
    }
    else {
        if (cmd->timeout_ms) {
            // Store the current time in the first 8 bytes, which are not used yet.
            *(uint64_t*)cmd = cf_getms();
        }

        // Send command through queue so it can be executed in event loop thread.
        if (! as_event_send(cmd)) {
            as_event_command_free(cmd);
            return as_error_set_message(err, AEROSPIKE_ERR_CLIENT, "Failed to queue command");
        }
    }
    return AEROSPIKE_OK;
}
Example #4
File: aio.c Project: sameerapadhye/act
/* Processing reads when they return from aio_read */
static void process_read(as_async_info_t *info)
{ 
	if(!g_running)
	{
		return;
	}
	cf_atomic_int_decr(&g_read_reqs_queued);
	uint64_t stop_time = cf_getms();
	fd_put(info->p_readreq.p_device, info->fd);
	
	if (stop_time != -1) // vestigial guard - cf_getms() cannot fail; kept to mirror the device-read path
	{
		histogram_insert_data_point(g_p_raw_read_histogram,
				safe_delta_ms(info->raw_start_time, stop_time));
		histogram_insert_data_point(g_p_read_histogram,
				safe_delta_ms(info->p_readreq.start_time, stop_time));
		histogram_insert_data_point(
				info->p_readreq.p_device->p_raw_read_histogram,
				safe_delta_ms(info->raw_start_time, stop_time));
	}
	if (g_use_valloc && info->p_buffer) 
	{
		free(info->p_buffer);
	}

	uintptr_t temp = (uintptr_t)info;
	cf_queue_push(async_info_queue, (void*)&temp);
}
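
These ACT examples repeatedly time an operation with cf_getms() and feed the delta into a histogram. safe_delta_ms() itself is not shown in this excerpt; a plausible sketch, assuming it merely clamps the delta so a backwards clock step cannot produce a huge unsigned value:

// Hypothetical reconstruction - the real helper lives elsewhere in act.
static inline uint64_t safe_delta_ms(uint64_t start_ms, uint64_t stop_ms)
{
	return start_ms > stop_ms ? 0 : stop_ms - start_ms;
}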
Example #5
File: aio.c Project: sameerapadhye/act
//------------------------------------------------
// Do one large block write operation and report.
//
static void write_and_report_large_block(device* p_device) {
	salter* p_salter;

	if (g_num_write_buffers > 1) {
		p_salter = &g_salters[rand_32() % g_num_write_buffers];

		pthread_mutex_lock(&p_salter->lock);
		*(uint32_t*)p_salter->p_buffer = p_salter->stamp++;
	}
	else {
		p_salter = &g_salters[0];
	}

	uint64_t offset = random_large_block_offset(p_device);
	uint64_t start_time = cf_getms();
	uint64_t stop_time = write_to_device(p_device, offset,
			g_large_block_ops_bytes, p_salter->p_buffer);

	if (g_num_write_buffers > 1) {
		pthread_mutex_unlock(&p_salter->lock);
	}

	if (stop_time != -1) {
		histogram_insert_data_point(g_p_large_block_write_histogram,
				safe_delta_ms(start_time, stop_time));
	}
}
Example #6
// Process a batch request.
static void
batch_process_request(batch_transaction* btr)
{
	// Keep the reaper at bay.
	btr->fd_h->last_used = cf_getms();

	cf_buf_builder* bb = 0;
	batch_build_response(btr, &bb);

	int fd = btr->fd_h->fd;

	if (bb) {
		int brv = batch_send_header(fd, bb->used_sz);

		if (brv == 0) {
			brv = batch_send(fd, bb->buf, bb->used_sz, MSG_NOSIGNAL | MSG_MORE);

			if (brv == 0) {
				brv = batch_send_final(fd, 0);
			}
		}
		cf_buf_builder_free(bb);
	}
	else {
		cf_info(AS_BATCH, " batch request: returned no local responses");
		batch_send_final(fd, 0);
	}

	batch_transaction_done(btr);
}
Example #7
int
citrusleaf_info_cluster_foreach(
	as_cluster *cluster, const char *command, bool send_asis, bool check_bounds, int timeout_ms, void *udata,
	bool (*callback)(const as_node * node, const struct sockaddr_in * sa_in, const char *command, char *value, void *udata)
)
{
	//Usage notes:
	//udata   - allocated by the caller; passed back through to the callback function.
	//command - command string; allocated by the caller (who must free it); passed to the server for execution.
	//value   - allocated by the C client; this loop frees it after the callback returns, so the callback must not.
	
	if (timeout_ms == 0) {
		timeout_ms = 100; // milliseconds
	}
	
	uint64_t start = cf_getms();
	uint64_t end = start + timeout_ms;
	int ret = 0;

	as_nodes* nodes = as_nodes_reserve(cluster);
	
	for (uint32_t i = 0; i < nodes->size; i++) {
		as_node* node = nodes->array[i];
		struct sockaddr_in* sa_in = as_node_get_address(node);
		char* value = 0;

		if (citrusleaf_info_host_auth(cluster, sa_in, (char *)command, &value, (int)(end - cf_getms()), send_asis, check_bounds) == 0) {
			bool status = callback(node, sa_in, command, value, udata);
			
			if (value) {
				free(value);
			}
			
		if (! status) {
				ret = -1;
				break;
			}
		}
		
		if (cf_getms() >= end) {
			ret = -2;
			break;
		}
	}
	as_nodes_release(nodes);
	return ret;
}
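
A minimal sketch of a conforming callback (hypothetical names). Note that the loop above frees value after the callback returns, so the callback must not free it:

// Hypothetical callback: print each node's response. Returning false stops
// the iteration and makes the foreach return -1.
static bool
print_info_cb(const as_node* node, const struct sockaddr_in* sa_in,
		const char* command, char* value, void* udata)
{
	(void)node; (void)sa_in; (void)udata;
	printf("%s -> %s\n", command, value ? value : "<no response>");
	return true; // do NOT free value here
}

// citrusleaf_info_cluster_foreach(cluster, "statistics", false, false, 1000, NULL, print_info_cb);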
Example #8
uint64_t
udf_timer_timeslice(const as_timer *as_tt)
{
	time_tracker *tt = (time_tracker *)pthread_getspecific(timer_tlskey);
	if (!tt || !tt->end_time || !tt->udata) {
		return 1; // no tracker state - report the minimum timeslice
	}
	uint64_t now = cf_getms();
	uint64_t end = tt->end_time(tt);
	// Unsigned subtraction would wrap if the deadline has already passed.
	return (end > now) ? end - now : 1;
}
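
Both udf_timer_timedout and udf_timer_timeslice read a time_tracker from thread-local storage. A sketch of how a caller might install one before running a UDF, assuming only the udata and end_time members that the accessors above actually touch:

// Hypothetical end-time hook: udata points at an absolute cf_getms() deadline.
static uint64_t
my_end_time(time_tracker* tt)
{
	return *(uint64_t*)tt->udata;
}

static void
run_udf_with_deadline(uint64_t deadline_ms)
{
	time_tracker tt;
	tt.udata = &deadline_ms;
	tt.end_time = my_end_time;
	pthread_setspecific(timer_tlskey, &tt); // visible to udf_timer_* on this thread
	// ... run the UDF here ...
	pthread_setspecific(timer_tlskey, NULL); // clear before tt goes out of scope
}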
Example #9
File: aio.c Project: sameerapadhye/act
//------------------------------------------------
// Do one large block read operation and report.
//
static void read_and_report_large_block(device* p_device) {
	uint64_t offset = random_large_block_offset(p_device);
	uint64_t start_time = cf_getms();
	uint64_t stop_time = read_from_device(p_device, offset,
			g_large_block_ops_bytes, p_device->p_large_block_read_buffer);

	if (stop_time != -1) {
		histogram_insert_data_point(g_p_large_block_read_histogram,
				safe_delta_ms(start_time, stop_time));
	}
}
Example #10
static int
wait_socket(as_socket_fd fd, uint32_t socket_timeout, uint64_t deadline, bool read)
{
	as_poll poll;
	as_poll_init(&poll, fd);

	uint32_t timeout;
	int rv;

	while (true) {
		if (deadline > 0) {
			uint64_t now = cf_getms();

			if (now >= deadline) {
				rv = 1;  // timeout
				break;
			}

			timeout = (uint32_t)(deadline - now);

			if (socket_timeout > 0 && socket_timeout < timeout) {
				timeout = socket_timeout;
			}
		}
		else {
			timeout = socket_timeout;
		}

		rv = as_poll_socket(&poll, fd, timeout, read);

		if (rv > 0) {
			rv = 0;  // success
			break;
		}

		if (rv < 0) {
			break;  // error
		}
		// rv == 0 means the poll timed out. Continue - a per-poll socket
		// timeout may have fired before the overall deadline.
	}

	as_poll_destroy(&poll);
	return rv;
}
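
A hypothetical read path showing how the two timeouts compose - socket_timeout caps each individual poll, while deadline (an absolute cf_getms() time) caps the operation as a whole:

// Hypothetical helper: wait until fd is readable, then read.
static int
read_some(as_socket_fd fd, uint8_t* buf, size_t len, uint64_t deadline)
{
	int rv = wait_socket(fd, 500, deadline, true); // poll at most 500 ms at a time

	if (rv != 0) {
		return rv; // 1 = overall deadline passed, < 0 = poll error
	}
	return (int)recv(fd, buf, len, 0);
}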
Example #11
static as_status
as_admin_read_list(aerospike* as, as_error* err, const as_policy_admin* policy, uint8_t* command, uint8_t* end, as_admin_parse_fn parse_fn, as_vector* list)
{
	int timeout_ms = (policy)? policy->timeout : as->config.policies.admin.timeout;
	if (timeout_ms <= 0) {
		timeout_ms = DEFAULT_TIMEOUT;
	}
	uint64_t deadline_ms = cf_getms() + timeout_ms;
	as_node* node = as_node_get_random(as->cluster);
	
	if (! node) {
		return as_error_set_message(err, AEROSPIKE_ERR_CLIENT, "Failed to find server node.");
	}
	
	int fd;
	as_status status = as_node_get_connection(err, node, &fd);
	
	if (status) {
		as_node_release(node);
		return status;
	}
	
	status = as_admin_send(err, fd, command, end, deadline_ms);
	
	if (status) {
		as_close(fd);
		as_node_release(node);
		return status;
	}
	
	status = as_admin_read_blocks(err, fd, deadline_ms, parse_fn, list);
	
	if (status) {
		as_close(fd);
		as_node_release(node);
		return status;
	}

	as_node_put_connection(node, fd);
	as_node_release(node);
	return status;
}
Example #12
File: aio.c Project: sameerapadhye/act
//------------------------------------------------
// Do one device write operation.
//
static uint64_t write_to_device(device* p_device, uint64_t offset,
		uint32_t size, uint8_t* p_buffer) {
	int fd = fd_get(p_device);

	if (fd == -1) {
		return -1;
	}

	if (lseek(fd, offset, SEEK_SET) != offset ||
			write(fd, p_buffer, size) != (ssize_t)size) {
		close(fd);
		fprintf(stdout, "ERROR: seek & write\n");
		return -1;
	}

	uint64_t stop_ms = cf_getms();

	fd_put(p_device, fd);

	return stop_ms;
}
Example #13
void
repl_write_reset_rw(rw_request* rw, as_transaction* tr, repl_write_done_cb cb)
{
	// Reset rw->from.any which was set null in tr setup. (And note that
	// tr->from.any will be null here in respond-on-master-complete mode.)
	rw->from.any = tr->from.any;

	// Needed for response to origin.
	rw->generation = tr->generation;
	rw->void_time = tr->void_time;

	rw->repl_write_cb = cb;

	// TODO - is this better than not resetting? Note - xmit_ms not volatile.
	rw->xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	rw->retry_interval_ms = g_config.transaction_retry_ms;

	for (int i = 0; i < rw->n_dest_nodes; i++) {
		rw->dest_complete[i] = false;
	}
}
Example #14
File: aio.c Project: sameerapadhye/act
//------------------------------------------------
// Runs in every device large-block write thread,
// executes large-block writes at a constant rate.
//
static void* run_large_block_writes(void* pv_device) {
	device* p_device = (device*)pv_device;
	uint64_t count = 0;

	while (g_running) {
		write_and_report_large_block(p_device);

		count++;

		int sleep_ms = (int)
			(((count * 1000 * g_num_devices) / g_large_block_ops_per_sec) -
			 (cf_getms() - g_run_start_ms));

		if (sleep_ms > 0) {
			usleep((uint32_t)sleep_ms * 1000);
		}

	}

	return (0);
}
Example #15
static int
as_read_users(aerospike* as, const as_policy_admin* policy, uint8_t* buffer, uint8_t* end, as_vector* /*<as_user_roles*>*/ users)
{
	int timeout_ms = (policy)? policy->timeout : as->config.policies.admin.timeout;
	if (timeout_ms <= 0) {
		timeout_ms = DEFAULT_TIMEOUT;
	}
	uint64_t deadline_ms = cf_getms() + timeout_ms;
	as_node* node = as_node_get_random(as->cluster);
	
	if (! node) {
		return CITRUSLEAF_FAIL_CLIENT;
	}
	
	int fd;
	int status = as_node_get_connection(node, &fd);
	
	if (status) {
		as_node_release(node);
		return status;
	}
	
	if (as_send(fd, buffer, end, deadline_ms, timeout_ms)) {
		cf_close(fd);
		as_node_release(node);
		return CITRUSLEAF_FAIL_TIMEOUT;
	}
	
	status = as_read_user_blocks(fd, buffer, deadline_ms, timeout_ms, users);
	
	if (status >= 0) {
		as_node_put_connection(node, fd);
	}
	else {
		cf_close(fd);
	}
	as_node_release(node);
	return status;
}
Example #16
void
repl_write_setup_rw(rw_request* rw, as_transaction* tr,
		repl_write_done_cb repl_write_cb, timeout_done_cb timeout_cb)
{
	rw->msgp = tr->msgp;
	tr->msgp = NULL;

	rw->msg_fields = tr->msg_fields;
	rw->origin = tr->origin;
	rw->from_flags = tr->from_flags;

	rw->from.any = tr->from.any;
	rw->from_data.any = tr->from_data.any;
	tr->from.any = NULL;

	rw->start_time = tr->start_time;
	rw->benchmark_time = tr->benchmark_time;

	as_partition_reservation_copy(&rw->rsv, &tr->rsv);
	// Hereafter, rw_request must release reservation - happens in destructor.

	rw->end_time = tr->end_time;
	rw->generation = tr->generation;
	rw->void_time = tr->void_time;

	rw->repl_write_cb = repl_write_cb;
	rw->timeout_cb = timeout_cb;

	rw->xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	rw->retry_interval_ms = g_config.transaction_retry_ms;

	for (int i = 0; i < rw->n_dest_nodes; i++) {
		rw->dest_complete[i] = false;
	}

	// Allow retransmit thread to destroy rw_request as soon as we unlock.
	rw->is_set_up = true;
}
Example #17
static int
as_execute(aerospike* as, const as_policy_admin* policy, uint8_t* buffer, uint8_t* end)
{
	int timeout_ms = (policy)? policy->timeout : as->config.policies.admin.timeout;
	if (timeout_ms <= 0) {
		timeout_ms = DEFAULT_TIMEOUT;
	}
	uint64_t deadline_ms = cf_getms() + timeout_ms;
	as_node* node = as_node_get_random(as->cluster);
	
	if (! node) {
		return CITRUSLEAF_FAIL_CLIENT;
	}
	
	int fd;
	int status = as_node_get_connection(node, &fd);
	
	if (status) {
		as_node_release(node);
		return status;
	}

	if (as_send(fd, buffer, end, deadline_ms, timeout_ms)) {
		cf_close(fd);
		as_node_release(node);
		return CITRUSLEAF_FAIL_TIMEOUT;
	}
	
	if (cf_socket_read_timeout(fd, buffer, HEADER_SIZE, deadline_ms, timeout_ms)) {
		cf_close(fd);
		as_node_release(node);
		return CITRUSLEAF_FAIL_TIMEOUT;
	}
	
	as_node_put_connection(node, fd);
	as_node_release(node);
	return buffer[RESULT_CODE];
}
Example #18
int
as_authenticate(int fd, const char* user, const char* credential, int timeout_ms)
{
	uint8_t buffer[STACK_BUF_SZ];
	uint8_t* p = buffer + 8;

	p = write_header(p, AUTHENTICATE, 2);
	p = write_field_string(p, USER, user);
	p = write_field_string(p, CREDENTIAL, credential);
	
	if (timeout_ms == 0) {
		timeout_ms = DEFAULT_TIMEOUT;
	}
	uint64_t deadline_ms = cf_getms() + timeout_ms;
	
	if (as_send(fd, buffer, p, deadline_ms, timeout_ms)) {
		return CITRUSLEAF_FAIL_TIMEOUT;
	}

	if (cf_socket_read_timeout(fd, buffer, HEADER_SIZE, deadline_ms, timeout_ms)) {
		return CITRUSLEAF_FAIL_TIMEOUT;
	}
	return buffer[RESULT_CODE];
}
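
A hypothetical call site - authenticate a freshly opened connection before issuing commands on it ("credential" is assumed to be the client's pre-computed password hash):

int rv = as_authenticate(fd, "admin", credential, 0); // 0 selects DEFAULT_TIMEOUT

if (rv != 0) {
	cf_close(fd); // CITRUSLEAF_FAIL_TIMEOUT or a non-zero server result code
}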
Example #19
// Make a callback for a specified number of elements in the tree, from outside
// the tree lock.
void
as_index_reduce_partial(as_index_tree *tree, uint32_t sample_count,
		as_index_reduce_fn cb, void *udata)
{
	pthread_mutex_lock(&tree->reduce_lock);

	// For full reduce, get the number of elements inside the tree lock.
	if (sample_count == AS_REDUCE_ALL) {
		sample_count = tree->elements;
	}

	if (sample_count == 0) {
		pthread_mutex_unlock(&tree->reduce_lock);
		return;
	}

	size_t sz = sizeof(as_index_ph_array) +
			(sizeof(as_index_ph) * sample_count);
	as_index_ph_array *v_a;
	uint8_t buf[64 * 1024];

	if (sz > 64 * 1024) {
		v_a = cf_malloc(sz);

		if (! v_a) {
			pthread_mutex_unlock(&tree->reduce_lock);
			return;
		}
	}
	else {
		v_a = (as_index_ph_array*)buf;
	}

	v_a->alloc_sz = sample_count;
	v_a->pos = 0;

	uint64_t start_ms = cf_getms();

	// Recursively, fetch all the value pointers into this array, so we can make
	// all the callbacks outside the big lock.
	if (tree->root->left_h != tree->sentinel_h) {
		as_index_reduce_traverse(tree, tree->root->left_h, tree->sentinel_h,
				v_a);
	}

	cf_debug(AS_INDEX, "as_index_reduce_traverse took %"PRIu64" ms",
			cf_getms() - start_ms);

	pthread_mutex_unlock(&tree->reduce_lock);

	for (uint32_t i = 0; i < v_a->pos; i++) {
		as_index_ref r_ref;

		r_ref.skip_lock = false;
		r_ref.r = v_a->indexes[i].r;
		r_ref.r_h = v_a->indexes[i].r_h;

		olock_vlock(g_config.record_locks, &r_ref.r->key, &r_ref.olock);
		cf_atomic_int_incr(&g_config.global_record_lock_count);

		// Callback MUST call as_record_done() to unlock and release record.
		cb(&r_ref, udata);
	}

	if (v_a != (as_index_ph_array*)buf) {
		cf_free(v_a);
	}
}
Example #20
// Incoming messages start here.
// - Could get a request that we need to service.
// - Could get a response to one of our requests - need to find the request and
//   send the real response to the remote end.
int
proxy_msg_fn(cf_node id, msg *m, void *udata)
{
	int rv;

	if (cf_rc_count((void*)m) == 0) {
		cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naughty %p", m);
		return -1;
	}

	uint32_t op = 99999;
	msg_get_uint32(m, PROXY_FIELD_OP, &op);
	uint32_t transaction_id = 0;
	msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id);

	cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id);

	switch (op) {
		case PROXY_OP_REQUEST:
		{
			cf_atomic_int_incr(&g_config.proxy_action);

#ifdef DEBUG
			cf_debug(AS_PROXY, "Proxy_msg: received request");
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy msg");
#endif
#endif
			cf_digest *key;
			size_t sz = 0;
			if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
				cf_info(AS_PROXY, "proxy msg function: no digest, problem");
				as_fabric_msg_put(m);
				return 0;
			}
			cl_msg *msgp;
			size_t as_msg_sz = 0;
			if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) {
				cf_info(AS_PROXY, "proxy msg function: no as msg, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			uint64_t cluster_key = 0;
			if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) {
				cf_info(AS_PROXY, "proxy msg function: no cluster key, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			// This is allowed to fail - this is a new field, and gets defaulted
			// to 0 if it doesn't exist.
			uint32_t timeout_ms = 0;
			msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms);
//			cf_info(AS_PROXY, "proxy msg: received timeout_ms of %d",timeout_ms);

			// Put the as_msg on the normal queue for processing.
			// INIT_TR
			as_transaction tr;
			as_transaction_init(&tr, key, msgp);
			tr.incoming_cluster_key = cluster_key;
			tr.end_time             = (timeout_ms != 0) ? ((uint64_t)timeout_ms * 1000000) + tr.start_time : 0;
			tr.proxy_node           = id;
			tr.proxy_msg            = m;

			// Check here if this is shipped op.
			uint32_t info = 0;
			msg_get_uint32(m, PROXY_FIELD_INFO, &info);
			if (info & PROXY_INFO_SHIPPED_OP) {
				tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP;
				cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received");
			} else {
				cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid);
			}

			MICROBENCHMARK_RESET();

			thr_tsvc_enqueue(&tr);
		}
		break;

		case PROXY_OP_RESPONSE:
		{
#ifdef DEBUG
			// Got the response from the actual endpoint.
			cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id);
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy response");
#endif
#endif

			// Look up the element.
			proxy_request pr;
			bool free_msg = true;
			if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) {
				// Found the element (sometimes we get two acks so it's OK for
				// an ack to not find the transaction).

				if (pr.wr) {
					as_proxy_shipop_response_hdlr(m, &pr, &free_msg);
				} else {
					as_proto *proto;
					size_t proto_sz;
					if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
						cf_info(AS_PROXY, "msg get buf failed!");
					}

#ifdef DEBUG_VERBOSE
					cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %"PRIu64" %d", proto, proto_sz, pr.fd);
					for (size_t _i = 0; _i < proto_sz; _i++) {
						fprintf(stderr, " %x", ((byte *)proto)[_i]);
						if (_i % 16 == 15) {
							fprintf(stderr, "\n");
						}
					}
#endif

#ifdef EXTRA_CHECKS
					as_proto proto_copy = *proto;
					as_proto_swap(&proto_copy);
					if (proto_copy.sz + 8 != proto_sz) {
						cf_info(AS_PROXY, "BONE BONE BONE!!!");
						cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %u", (uint64_t) proto_copy.sz, proto_sz);
					}
#endif

					// Write to the file descriptor.
					cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd);
					cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0");

					if (pr.batch_shared) {
						cf_digest* digest;
						size_t digest_sz = 0;

						if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) {
							as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz);
							as_proxy_set_stat_counters(0);
						}
						else {
							cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id);
							as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN);
							as_proxy_set_stat_counters(-1);
						}
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);
					}
					else {
						size_t pos = 0;
						while (pos < proto_sz) {
							rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);
							if (rv > 0) {
								pos += rv;
							}
							else if (rv < 0) {
								if (errno != EWOULDBLOCK) {
									// Common message when a client aborts.
									cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %d pos %d rv %d errno %d", pr.fd_h->fd, proto_sz, pos, rv, errno);
									shutdown(pr.fd_h->fd, SHUT_RDWR);
									as_proxy_set_stat_counters(-1);
									goto SendFin;
								}
								usleep(1); // yield
							}
							else {
								cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", pr.fd_h->fd, proto_sz, pos);
								shutdown(pr.fd_h->fd, SHUT_RDWR);
								as_proxy_set_stat_counters(-1);
								goto SendFin;
							}
						}
						as_proxy_set_stat_counters(0);
SendFin:
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);

						// Return the fabric message or the direct file descriptor -
						// after write and complete.
						pr.fd_h->t_inprogress = false;
						AS_RELEASE_FILE_HANDLE(pr.fd_h);
						pr.fd_h = 0;
					}
					as_fabric_msg_put(pr.fab_msg);
					pr.fab_msg = 0;
				}
			}
			else {
				cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id);
				as_proxy_set_stat_counters(-1);
			}

			if (free_msg) {
				as_fabric_msg_put(m);
			}
		}
		break;

		case PROXY_OP_REDIRECT:
		{
			// Sometimes the destination we proxied a request to isn't able to
			// satisfy it (for example, their copy of the partition in question
			// might be desync).
			cf_node new_dst = 0;
			msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst);
			cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst);

			// Look in the proxy retransmit hash for the tid.
			proxy_request *pr;
			pthread_mutex_t *pr_lock;
			int r = 0;
			if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) {
				cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id);
				as_fabric_msg_put(m);
				return -1;
			}

			if (g_config.self_node == new_dst) {

				// Although we don't know we're the final destination, undo the
				// proxy-nature and put back on the main queue. Dangerous, as it
				// leaves open the possibility of a looping message.

				cf_digest *key;
				size_t sz = 0;
				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				cl_msg *msgp;
				sz = 0;
				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				// Put the as_msg on the normal queue for processing.
				// INIT_TR
				as_transaction tr;
				as_transaction_init(&tr, key, msgp);
				tr.start_time = pr->start_time; // start time
				tr.end_time   = pr->end_time;
				tr.proto_fd_h = pr->fd_h;
				tr.batch_shared = pr->batch_shared;
				tr.batch_index = pr->batch_index;

				MICROBENCHMARK_RESET();

				thr_tsvc_enqueue(&tr);

				as_fabric_msg_put(pr->fab_msg);
				shash_delete_lockfree(g_proxy_hash, &transaction_id);
			}
			else {
				// Change the destination, update the retransmit time.
				pr->dest = new_dst;
				pr->xmit_ms = cf_getms() + 1;

				// Send it.
				msg_incr_ref(pr->fab_msg);
				if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) {
					cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv);
					as_fabric_msg_put(pr->fab_msg);
				}
			}

			pthread_mutex_unlock(pr_lock);
		}
		as_fabric_msg_put(m);
		break;
		default:
			cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op);
			msg_dump(m, "proxy received unknown msg");
			as_fabric_msg_put(m);
			break;
	} // end switch

	return 0;
} // end proxy_msg_fn()
Example #21
int
as_proxy_shipop(cf_node dst, write_request *wr)
{
	as_partition_id pid = as_partition_getid(wr->keyd);

	if (dst == 0) {
		cf_crash(AS_PROXY, "the destination should never be zero");
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);
	if (!m)	{
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &wr->keyd, sizeof(cf_digest), MSG_SET_COPY);
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) wr->msgp, as_proto_size_get(&wr->msgp->proto), MSG_SET_HANDOFF_MALLOC);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, as_paxos_get_cluster_key());
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, wr->msgp->msg.transaction_ttl);
	wr->msgp = 0;

	// If it is shipped op.
	uint32_t info = 0;
	info |= PROXY_INFO_SHIPPED_OP;
	msg_set_uint32(m, PROXY_FIELD_INFO, info);

	cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP %s->WINNER msg %p Proxy Sent to %"PRIx64" %p tid(%d)",
			wr->proxy_msg ? "NONORIG" : "ORIG", m, dst, wr, tid);

	// Fill out a retransmit structure, insert into the retransmit hash.
	msg_incr_ref(m);
	proxy_request pr;
	pr.start_time  = wr->start_time;
	pr.end_time    = (wr->end_time != 0) ? wr->end_time : pr.start_time + g_config.transaction_max_ns;
	cf_rc_reserve(wr);
	pr.wr          = wr;
	pr.fab_msg     = m;
	pr.xmit_ms     = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest        = dst;
	pr.pid         = pid;
	pr.fd_h        = NULL;
	pr.batch_shared = NULL;
	pr.batch_index = 0;

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_info(AS_PROXY, " shash_put failed, need cleanup code");
		return -1;
	}

	// Send to the remote node.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		cf_detail(AS_PROXY, "SHIPPED_OP ORIG [Digest %"PRIx64"] Failed with %d", *(uint64_t *)&wr->keyd, rv);
		as_fabric_msg_put(m);
	}

	wr->shipped_op_initiator = true;
	cf_atomic_int_incr(&g_config.ldt_proxy_initiate);

	return 0;
}
示例#22
0
// Make a request to another node.
//
// Note: there's a cheat here. 'as_msg' is used in a raw form, and includes
// structured data (version - type - nfields - sz ...) which should be made more
// wire-protocol-friendly.
int
as_proxy_divert(cf_node dst, as_transaction *tr, as_namespace *ns, uint64_t cluster_key)
{
	cf_detail(AS_PROXY, "proxy divert");

	cf_atomic_int_incr(&g_config.stat_proxy_reqs);
	if (tr->msgp && (tr->msgp->msg.info1 & AS_MSG_INFO1_XDR)) {
		cf_atomic_int_incr(&g_config.stat_proxy_reqs_xdr);
	}
	as_partition_id pid = as_partition_getid(tr->keyd);

	if (dst == 0) {
		// Get the list of replicas.
		dst = as_partition_getreplica_read(ns, pid);
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);
	if (!m)	{
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &tr->keyd, sizeof(cf_digest), MSG_SET_COPY);
	msg_set_type msettype = tr->batch_shared ? MSG_SET_COPY : MSG_SET_HANDOFF_MALLOC;
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) tr->msgp, as_proto_size_get(&tr->msgp->proto), msettype);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, cluster_key);
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, tr->msgp->msg.transaction_ttl);

	tr->msgp = 0;

	cf_debug_digest(AS_PROXY, &tr->keyd, "proxy_divert: fab_msg %p dst %"PRIx64, m, dst);

	// Fill out a retransmit structure, insert into the retransmit hash.
	msg_incr_ref(m);
	proxy_request pr;
	pr.start_time = tr->start_time;
	pr.end_time = (tr->end_time != 0) ? tr->end_time : pr.start_time + g_config.transaction_max_ns;
	pr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	pr.fab_msg = m;
	pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest = dst;
	pr.pid = pid;
	pr.ns = ns;
	pr.wr = NULL;
	pr.batch_shared = tr->batch_shared;
	pr.batch_index = tr->batch_index;

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_debug(AS_PROXY, " shash_put failed, need cleanup code");
		return -1;
	}

	// Send to the remote node.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		cf_debug(AS_PROXY, "as_proxy_divert: returned error %d", rv);
		as_fabric_msg_put(m);
	}

	cf_atomic_int_incr(&g_config.proxy_initiate);

	return 0;
}
Example #23
//Same as do_the_full_monte, but only until the command is sent to the node.
//Most of the code is duplicated. Bad.
int
cl_do_async_monte(cl_cluster *asc, int info1, int info2, const char *ns, const char *set, const cl_object *key,
			const cf_digest *digest, cl_bin **values, cl_operator operator, cl_operation **operations, 
			int *n_values, uint32_t *cl_gen, const cl_write_parameters *cl_w_p, uint64_t *trid, void *udata)

{
	cl_async_work	*workitem = NULL;

	uint8_t		wr_stack_buf[STACK_BUF_SZ];
	uint8_t		*wr_buf = wr_stack_buf;
	size_t		wr_buf_sz = sizeof(wr_stack_buf);
	int        	progress_timeout_ms;
	uint64_t 	deadline_ms;
	uint64_t	starttime, endtime;
	bool 		network_error;
	int 		fd = -1;
	int		rv = CITRUSLEAF_FAIL_CLIENT;	//Assume that this is a failure;

	// as_msg 		msg;
	cf_digest	d_ret;
	cl_cluster_node	*node = 0;

#if ONEASYNCFD
	if (shash_get_size(g_cl_async_hashtab) >= g_async_h_szlimit) {
		//cf_error("Async hashtab is full. Cannot insert any more elements");
		return CITRUSLEAF_FAIL_ASYNCQ_FULL;
	}
#else
	//If the async buffer is at the max limit, do not entertain more requests.
	if (cf_queue_sz(g_cl_async_q) >= cf_atomic32_get(g_async_q_szlimit)) {
		//cf_error("Async buffer is full. Cannot insert any more elements");
		return CITRUSLEAF_FAIL_ASYNCQ_FULL;
	}
#endif

	//Allocate memory for work item that will be added to the async work list

	if (cf_queue_sz(g_cl_workitems_freepool_q) > 0) {
		cf_queue_pop(g_cl_workitems_freepool_q, &workitem, CF_QUEUE_FOREVER);
	} else {
		workitem = malloc(sizeof(cl_async_work));
		if (workitem == NULL) {
			return CITRUSLEAF_FAIL_CLIENT;
		}
	}

	//Compile the write buffer to be sent to the cluster
	if (n_values && (values || operations)) {
		cl_compile(info1, info2, 0, ns, set, key, digest, values ? *values : NULL, operator, operations ? *operations : NULL,
				*n_values, &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid, NULL, NULL, 0 /*udf_type*/);
	} else {
		cl_compile(info1, info2, 0, ns, set, key, digest, 0, 0, 0, 0, &wr_buf, &wr_buf_sz, cl_w_p, &d_ret, *trid, NULL, NULL, 0 /*udf_type*/);
	}

	deadline_ms = 0;
	progress_timeout_ms = 0;
	if (cl_w_p && cl_w_p->timeout_ms) {
		deadline_ms = cf_getms() + cl_w_p->timeout_ms;
		// policy: if asking for a long timeout, give enough time to try twice
		if (cl_w_p->timeout_ms > 700) {
			progress_timeout_ms = cl_w_p->timeout_ms / 2;
		}
		else {
			progress_timeout_ms = cl_w_p->timeout_ms;
		}
	}
	else {
		progress_timeout_ms = g_async_nw_progress_timeout;
	}

	//Initialize the async work unit
	workitem->trid = *trid;
	workitem->deadline = deadline_ms;
	workitem->starttime = cf_getms();
	workitem->udata = udata;

    as_msg *msgp;
    // Hate special cases, but we have to clear the verify bit on delete verify
    if ( (info2 & CL_MSG_INFO2_DELETE) && (info1 & CL_MSG_INFO1_VERIFY))
    {
        msgp = (as_msg *)wr_buf;
        msgp->m.info1 &= ~CL_MSG_INFO1_VERIFY;
    }
    
    if (asc->compression_stat.compression_threshold > 0 
     && wr_buf_sz > (size_t)asc->compression_stat.compression_threshold)
    {
        /* Compression is enabled.
         * Packet size is above threshold.
         * Compress the data
         */
        uint8_t *compressed_buf = NULL;
        size_t compressed_buf_sz = 0;

        // Construct packet for compressed data.
        cf_packet_compression (wr_buf, wr_buf_sz, &compressed_buf, &compressed_buf_sz);
        if (compressed_buf)
        {
            // If original packet size is > 16k, cl_compile had allocated memory for it.
            // Free that memory.
            // cf_packet_compression will allocate memory for compressed packet
            if (wr_buf != wr_stack_buf) {
                free(wr_buf);
            }
             // Update stats.
            citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, compressed_buf_sz);	
            wr_buf =  compressed_buf;
            wr_buf_sz = compressed_buf_sz;
            //memcpy (wr_buf, compressed_buf, compressed_buf_sz);
            //wr_buf_sz = compressed_buf_sz;
            //free (compressed_buf);
        }
        //else compression failed, continue with uncompressed packet
        else
        {
            // Set compression stat
            citrusleaf_cluster_put_compression_stat(asc, wr_buf_sz, wr_buf_sz);	
        }
    }

	int try = 0;
	// retry request based on the write_policy
	do {
		network_error = false;
		try++;
#ifdef DEBUG		
		if (try > 1) {
			cf_debug("request retrying try %d tid %zu", try, (uint64_t)pthread_self());
		}
#endif        

		// Get an FD from a cluster. First get the probable node for the given digest.
		node = cl_cluster_node_get(asc, ns, &d_ret, info2 & CL_MSG_INFO2_WRITE ? true : false);
		if (!node) {
#ifdef DEBUG
			cf_debug("warning: no healthy nodes in cluster, retrying");
#endif
			usleep(10000);	//Sleep for 10ms
			goto Retry;
		}

		// Now get the dedicated async FD of this node
		starttime = cf_getms();
		fd = cl_cluster_node_fd_get(node, true);
		endtime = cf_getms();
		if ((endtime - starttime) > 10) {
			cf_debug("Time to get FD for a node (>10ms)=%"PRIu64, (endtime - starttime));
		}
		if (fd == -1) {
#ifdef DEBUG			
			cf_debug("warning: node %s has no async file descriptors, retrying transaction (tid %zu)",node->name,(uint64_t)pthread_self() );
#endif			
			usleep(1000);
			goto Retry;
		}

		// Send the command to the node
		starttime = cf_getms();
		rv = cf_socket_write_timeout(fd, wr_buf, wr_buf_sz, deadline_ms, progress_timeout_ms);
		endtime = cf_getms();
		if ((endtime - starttime) > 10) {
			cf_debug("Time to write to the socket (>10ms)=%"PRIu64, (endtime - starttime));
		}
		if (rv != 0) {
			cf_debug("Citrusleaf: write timeout or error when writing header to server - %d fd %d errno %d (tid %zu)",
					rv,fd,errno,(uint64_t)pthread_self());
			if (rv != ETIMEDOUT)
				network_error = true;
			goto Retry;
		}
		goto Ok;

Retry:
		if (network_error == true) {
			/*
			 * For async work (XDS), dunning a node on a network error may
			 * be too extreme. We just clean things up and retry connecting
			 * to the remote cluster - the error may be a transient one. As
			 * this is a network error, it is better to wait a significant
			 * time before retrying.
			 */
			sleep(1);	//Sleep for 1sec
#if ONEASYNCFD
//Do not close the FD
#else
			cf_error("async sender: Closing the fd %d because of network error", fd);
			cf_close(fd);
			fd = -1;
#endif
		}

		if (fd != -1) {
			cf_error("async sender: Closing the fd %d because of retry", fd);
			cf_close(fd);
			fd = -1;
		}

		if (node) {
			cl_cluster_node_put(node); 
			node = 0; 
		}

		if (deadline_ms && (deadline_ms < cf_getms() ) ) {
#ifdef DEBUG            
			cf_debug("async sender: out of time : deadline %"PRIu64" now %"PRIu64, deadline_ms, cf_getms());
#endif            
			rv = CITRUSLEAF_FAIL_TIMEOUT;
			goto Error;
		}
	} while ( (cl_w_p == 0) || (cl_w_p->w_pol == CL_WRITE_RETRY) );

Error:	
#ifdef DEBUG	
	cf_debug("exiting with failure: network_error %d wpol %d timeleft %d rv %d",
			(int)network_error, (int)(cl_w_p ? cl_w_p->w_pol : 0), 
			(int)(deadline_ms - cf_getms() ), rv );
#endif	

	if (wr_buf != wr_stack_buf) {
		free(wr_buf);
	}

#if ONEASYNCFD
	//Do not close the FD
#else
	//If it is a network error, the fd would be closed and set to -1.
	//So, we reach this place with a valid FD in case of timeout.
	if (fd != -1) {
		cf_error("async sender: Closing the fd %d because of timeout", fd);
		cf_close(fd);
	}
#endif

	return(rv);
Ok:
	/*
	 * We cannot release the node here as the async FD associated
	 * with this node may get closed. We should do it only when
	 * we get back the ack for the async command that we just did.
	 */

	//As we sent the command successfully, add it to the async work list
	workitem->node = node;
	workitem->fd = fd;
	//We are storing only the pointer to the workitem
#if ONEASYNCFD
	if (shash_put_unique(g_cl_async_hashtab, trid, &workitem) != SHASH_OK) {
		//This should always succeed.
		cf_error("Unable to add unique entry into the hash table");
	}
	cf_queue_push(node->asyncwork_q, &workitem);	//Also put in the node's q
#else
	cf_queue_push(g_cl_async_q, &workitem);
#endif

	if (wr_buf != wr_stack_buf) {
		free(wr_buf);
	}

	rv = CITRUSLEAF_OK;
	return rv;

}

int citrusleaf_async_reinit(int size_limit, unsigned int num_receiver_threads)
{
	// int num_threads;

	if (0 == cf_atomic32_get(g_async_initialized)) {
		cf_error("Async client not initialized cannot reinit");
		return -1;
	}
	
	if (num_receiver_threads > MAX_ASYNC_RECEIVER_THREADS) {
		//Limit the threads to the max value even if the caller asks for more
		num_receiver_threads = MAX_ASYNC_RECEIVER_THREADS;
	}

	// If the number of threads is increased, create more threads.
	if (num_receiver_threads > g_async_num_threads) {
		unsigned int i;
		for (i = g_async_num_threads; i < num_receiver_threads; i++) {
			pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL);
		}
	}
	else {
		// else just reset the count - surplus async threads will kill themselves
		cf_atomic32_set(&g_async_num_threads, num_receiver_threads);
	}

	cf_atomic32_set(&g_async_q_szlimit , size_limit);
	return ( 0 );

}
int citrusleaf_async_init(int size_limit, int num_receiver_threads, cl_async_fail_cb fail_cb_fn, cl_async_success_cb success_cb_fn)
{
	int i, num_threads;

	//Make sure that we do the initialization only once
	if (1 == cf_atomic32_incr(&g_async_initialized)) {

		// Start the receiver threads
		num_threads = num_receiver_threads;
		if (num_threads > MAX_ASYNC_RECEIVER_THREADS) {
			//Limit the threads to the max value even if the caller asks for more
			num_threads = MAX_ASYNC_RECEIVER_THREADS;
		}

#if ONEASYNCFD
		g_async_h_szlimit = size_limit * 3;	//Max number of elements in the hash table
		g_async_h_buckets = g_async_h_szlimit/10;//Number of buckets in the hash table

		if (shash_create(&g_cl_async_hashtab, async_trid_hash, sizeof(uint64_t), sizeof(cl_async_work *),
					g_async_h_buckets, SHASH_CR_MT_BIGLOCK) != SHASH_OK) {
			cf_error("Failed to initialize the async work hastable");
			cf_atomic32_decr(&g_async_initialized);
			return -1;
		}
#else
		// create work queue
		g_async_q_szlimit = size_limit;
		if ((g_cl_async_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) {
			cf_error("Failed to initialize the async work queue");
			cf_atomic32_decr(&g_async_initialized);
			return -1;
		}

		for (i=0; i<num_threads; i++) {
			pthread_create(&g_async_reciever[i], 0, async_receiver_fn, NULL);
		}
		g_async_num_threads = num_threads;
#endif

		if ((g_cl_workitems_freepool_q = cf_queue_create(sizeof(cl_async_work *), true)) == NULL) {
			cf_error("Failed to create memory pool for workitems");
			return -1;
		}

		g_fail_cb_fn = fail_cb_fn;
		g_success_cb_fn = success_cb_fn;

		// Initialize the stats
		g_async_stats.retries = 0;
		g_async_stats.dropouts = 0;

	}
	
	return(0);	
}
Example #24
static void* 
async_receiver_fn(void *thdata)
{
	int 		rv = -1;
	bool 		network_error = false;
	cl_async_work	*workitem = NULL;
	// cl_async_work	*tmpworkitem = NULL;
	as_msg 		msg;
	cf_queue	*q_to_use = NULL;
	cl_cluster_node	*thisnode = NULL;

	uint8_t		rd_stack_buf[STACK_BUF_SZ];	
	uint8_t		*rd_buf = rd_stack_buf;
	size_t		rd_buf_sz = 0;

	uint64_t	acktrid;
	// uint64_t	starttime, endtime;
	int		progress_timeout_ms;
	unsigned int 	thread_id = cf_atomic32_incr(&g_thread_count);

	if (thdata == NULL) {
		q_to_use = g_cl_async_q;
	} else {
		thisnode = (cl_cluster_node *)thdata;
		q_to_use = thisnode->asyncwork_q;
	}
    
	//Infinite loop which keeps picking work items from the list and tries to read back the final result
	while(1) {
		network_error = false;
#if ONEASYNCFD
		if(thisnode->dunned == true) {
			do {
				rv = cf_queue_pop(thisnode->asyncwork_q, &workitem, CF_QUEUE_NOWAIT);
				if (rv == CF_QUEUE_OK) {
					cl_cluster_node_put(thisnode);
					free(workitem);
				}
			} while (rv == CF_QUEUE_OK);

			//We want to delete all the workitems of this node
			shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, thisnode);
			break;
		}
#endif
		//This call will block if there is no element in the queue
		cf_queue_pop(q_to_use, &workitem, CF_QUEUE_FOREVER);
		//TODO: What if the node gets dunned while this pop call is blocked ?
#if ONEASYNCFD
		//cf_debug("Elements remaining in this node's queue=%d, Hash table size=%d",
		//		cf_queue_sz(thisnode->asyncwork_q), shash_get_size(g_cl_async_hashtab));
#endif

		// If we have no progress in 50ms, we should move to the next workitem 
		// and revisit this workitem at a later stage
		progress_timeout_ms = DEFAULT_PROGRESS_TIMEOUT;

		// Read into this fine cl_msg, which is the short header
		rv = cf_socket_read_timeout(workitem->fd, (uint8_t *) &msg, sizeof(as_msg), workitem->deadline, progress_timeout_ms);
		if (rv) {
#if DEBUG
			cf_debug("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd);
#endif
			if (rv != ETIMEDOUT) {
				cf_error("Citrusleaf: error when reading header from server - rv %d fd %d",rv,workitem->fd);
				network_error = true;
				goto Error;
			} else {
				goto Retry;
			}

		}
#ifdef DEBUG_VERBOSE
		dump_buf("read header from cluster", (uint8_t *) &msg, sizeof(cl_msg));
#endif
		cl_proto_swap(&msg.proto);
		cl_msg_swap_header(&msg.m);

		// second read for the remainder of the message 
		rd_buf_sz =  msg.proto.sz  - msg.m.header_sz;
		if (rd_buf_sz > 0) {
			if (rd_buf_sz > sizeof(rd_stack_buf)) {
				rd_buf = malloc(rd_buf_sz);
				if (!rd_buf) {
					cf_error("malloc fail: trying %zu",rd_buf_sz);
					rv = -1; 
					goto Error; 
				}
			}

			rv = cf_socket_read_timeout(workitem->fd, rd_buf, rd_buf_sz, workitem->deadline, progress_timeout_ms);
			if (rv) {
				//We already read some part of the message before but failed to read the
				//remaining data for whatever reason (network error or timeout). We cannot
				//reread as we already read partial data. Declare this as error.
				cf_error("Timeout after reading the header but before reading the body");
				goto Error;
			}
#ifdef DEBUG_VERBOSE
			dump_buf("read body from cluster", rd_buf, rd_buf_sz);
#endif	
		}

		rv = CITRUSLEAF_OK;
		goto Ok;

Retry:
		//We are trying to postpone the reading
		if (workitem->deadline && workitem->deadline < cf_getms()) {
			cf_error("async receiver: out of time : deadline %"PRIu64" now %"PRIu64,
					workitem->deadline, cf_getms());
			//cf_error("async receiver: Workitem missed the final deadline");
			rv = CITRUSLEAF_FAIL_TIMEOUT;
			goto Error;
		} else {
			//We have time. Push the element back to the queue to be considered later
			cf_queue_push(q_to_use, &workitem);
		}

		//If we allocated memory in this loop, release it.
		if (rd_buf && (rd_buf != rd_stack_buf)) {
			free(rd_buf);
		}

		cf_atomic_int_incr(&g_async_stats.retries);

		continue;

Error:
		if (network_error == true) {
			/*
			 * For async work (XDS), dunning a node on a network error may
			 * be too extreme. We just clean things up and retry connecting
			 * to the remote cluster - the error may be a transient one.
			 */
		} 

#if ONEASYNCFD
//Do not close FD
#else
		//We do not know the state of FD. It may have pending data to be read.
		//We cannot reuse the FD. So, close it to be on safe side.
		cf_error("async receiver: Closing the fd %d because of error", workitem->fd);
		cf_close(workitem->fd);
		workitem->fd = -1;
#endif
		cf_atomic_int_incr(&g_async_stats.dropouts);
		//Continue down with what we do during an Ok

		//Inform the caller that there is no response from the server for this workitem.
		//No response does not mean the work is not done - it might have completed
		//successfully on the server side; we just didn't get a response for it.
		if (g_fail_cb_fn) {
			g_fail_cb_fn(workitem->udata, rv, workitem->starttime);
		}
Ok:
		//rd_buf may not be there during an error condition.
		if (rd_buf && (rv == CITRUSLEAF_OK)) {
			//As of now, async functionality exists only for the put call.
			//A put returns nothing back other than the trid field, so just
			//pass a variable to get back the trid and ignore the rest.
			if (0 != cl_parse(&msg.m, rd_buf, rd_buf_sz, NULL, NULL, NULL, &acktrid, NULL)) {
				rv = CITRUSLEAF_FAIL_UNKNOWN;
			}
			else {
				rv = msg.m.result_code;
				if (workitem->trid != acktrid) {
#if ONEASYNCFD
					//We may get the response for a different trid. Delete that
					//workitem from the queue and put the current workitem back
					//in the queue.
					shash_get(g_cl_async_hashtab, &acktrid, &tmpworkitem);
					cf_queue_delete(q_to_use, &tmpworkitem, true);
					cf_queue_push(q_to_use, &workitem);
					//From now on workitem will be the one for which we got ack
					workitem = tmpworkitem;
#endif
#ifdef DEBUG
					cf_debug("Got reply for a different trid. Expected=%"PRIu64" Got=%"PRIu64" FD=%d",
							workitem->trid, acktrid, workitem->fd);
#endif
				}
			}

			if (g_success_cb_fn) {
				g_success_cb_fn(workitem->udata, rv, workitem->starttime);
			}
		}

		//Remember to put back the FD into the pool, if it is re-usable.
		if (workitem->fd != -1) {
			cl_cluster_node_fd_put(workitem->node, workitem->fd, true);
		}
		//Also decrement the reference count for this node
		cl_cluster_node_put(workitem->node);

#if ONEASYNCFD
		//Delete the item from the global hashtable
		if (shash_delete(g_cl_async_hashtab, &workitem->trid) != SHASH_OK)
		{
#if DEBUG
			cf_debug("Failure while trying to delete trid=%"PRIu64" from hashtable", workitem->trid);
#endif
		}
#endif

		//Push it back into the free pool. If the attempt fails, free it.
		if (cf_queue_push(g_cl_workitems_freepool_q, &workitem) == -1) {
			free(workitem);
		}

		//If we allocated memory in this loop, release it.
		if (rd_buf && (rd_buf != rd_stack_buf)) {
			free(rd_buf);
		}

		// Kick this thread out if its ID is greater than total
		if (thread_id > cf_atomic32_get(g_async_num_threads)) {
			cf_atomic32_decr(&g_thread_count);
			return NULL;
		}
	}//The infinite loop

	return NULL;
}
Example #25
/* cf_queue_pop
 * if ms_wait < 0, wait forever
 * if ms_wait = 0, don't wait at all
 * if ms_wait > 0, wait that number of ms
 * */
int cf_queue_pop(cf_queue *q, void *buf, int ms_wait) {
	if (NULL == q) {
		cf_error("cf_queue_pop: try passing in a queue");
		return(-1);
	}

#ifdef EXTERNAL_LOCKS 
	if (ms_wait != CF_QUEUE_NOWAIT) {   // this implementation won't wait
		cf_error("cf_queue_pop: only nowait supported");
		return(-1);
	}
#endif // EXTERNAL_LOCKS

	QUEUE_LOCK(q);

	struct timespec tp;
	if (ms_wait > 0) {
#ifdef OSX
		uint64_t curms = cf_getms(); // using the cf generic functions defined in cf_clock.h; slightly less resolution than the pure Linux version
		tp.tv_sec = (curms + ms_wait) / 1000;
		tp.tv_nsec = ((curms + ms_wait) % 1000) * 1000000; // nsec must come from the absolute deadline, not just ms_wait
#else // linux
		clock_gettime( CLOCK_REALTIME, &tp); 
		tp.tv_sec += ms_wait / 1000;
		tp.tv_nsec += (ms_wait % 1000) * 1000000;
		if (tp.tv_nsec > 1000000000) {
			tp.tv_nsec -= 1000000000;
			tp.tv_sec++;
		}
#endif
	}

	/* FIXME error checking */
	/* Note that we apparently have to use a while() loop.  Careful reading
	 * of the pthread_cond_signal() documentation says that AT LEAST ONE
	 * waiting thread will be awakened... */
	if (q->threadsafe) {
#ifdef EXTERNAL_LOCKS
		if (CF_Q_EMPTY(q)) {
			QUEUE_UNLOCK(q);
			return(CF_QUEUE_EMPTY);
		}
#else
		while (CF_Q_EMPTY(q)) {
			if (CF_QUEUE_FOREVER == ms_wait) {
				pthread_cond_wait(&q->CV, &q->LOCK);
			}
			else if (CF_QUEUE_NOWAIT == ms_wait) {
				pthread_mutex_unlock(&q->LOCK);
				return(CF_QUEUE_EMPTY);
			}
			else {
				pthread_cond_timedwait(&q->CV, &q->LOCK, &tp);
				if (CF_Q_EMPTY(q)) {
					pthread_mutex_unlock(&q->LOCK);
					return(CF_QUEUE_EMPTY);
				}
			}
		}
#endif // EXTERNAL_LOCKS
	} else if (CF_Q_EMPTY(q))
		return(CF_QUEUE_EMPTY);

	memcpy(buf, CF_Q_ELEM_PTR(q,q->read_offset), q->elementsz);
	q->read_offset++;
	
	// Interesting idea - this probably keeps the cache fresher: when the
	// queue is fully empty, reset both offsets to zero.
	if (q->read_offset == q->write_offset) {
		q->read_offset = q->write_offset = 0;
	}

	QUEUE_UNLOCK(q);

	return(0);
}
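
The three wait modes from the header comment, as a hypothetical consumer (the queue was created with elements of type cl_async_work*):

cl_async_work* item = NULL;

cf_queue_pop(q, &item, CF_QUEUE_FOREVER); // ms_wait < 0: block until an element arrives

if (cf_queue_pop(q, &item, CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) { // ms_wait = 0
	// nothing queued right now
}

if (cf_queue_pop(q, &item, 50) == CF_QUEUE_EMPTY) { // ms_wait > 0: wait up to 50 ms
	// timed out
}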
Example #26
// Set of threads which talk to the client over the connection and do the
// necessary processing. Note that once an fd is assigned to a thread, all the
// work on that fd is done by that thread. Fair fd usage is expected of the
// client. The first thread is special - it also accepts new connections; it
// is the only thread which does so.
void *
thr_demarshal(void *arg)
{
	cf_socket_cfg *s, *ls;
	// Create my epoll fd, register in the global list.
	struct epoll_event ev;
	int nevents, i, n, epoll_fd;
	cf_clock last_fd_print = 0;

#if defined(USE_SYSTEMTAP)
	uint64_t nodeid = g_config.self_node;
#endif

	// Early stage aborts; these will cause faults in process scope.
	cf_assert(arg, AS_DEMARSHAL, CF_CRITICAL, "invalid argument");
	s = &g_config.socket;
	ls = &g_config.localhost_socket;

#ifdef USE_JEM
	int orig_arena;
	if (0 > (orig_arena = jem_get_arena())) {
		cf_crash(AS_DEMARSHAL, "Failed to get original arena for thr_demarshal()!");
	} else {
		cf_info(AS_DEMARSHAL, "Saved original JEMalloc arena #%d for thr_demarshal()", orig_arena);
	}
#endif

	// Figure out my thread index.
	pthread_t self = pthread_self();
	int thr_id;
	for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) {
		if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self))
			break;
	}

	if (thr_id == MAX_DEMARSHAL_THREADS) {
		cf_debug(AS_FABRIC, "Demarshal thread could not figure own ID, bogus, exit, fu!");
		return(0);
	}

	// First thread accepts new connection at interface socket.
	if (thr_id == 0) {
		demarshal_file_handle_init();
		epoll_fd = epoll_create(EPOLL_SZ);
		if (epoll_fd == -1)
			cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno));

		memset(&ev, 0, sizeof (ev));
		ev.events = EPOLLIN | EPOLLERR | EPOLLHUP;
		ev.data.fd = s->sock;
		if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &ev))
			cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno));
		cf_info(AS_DEMARSHAL, "Service started: socket %s:%d", s->addr, s->port);

		if (ls->sock) {
			ev.events = EPOLLIN | EPOLLERR | EPOLLHUP;
			ev.data.fd = ls->sock;
			if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ls->sock, &ev))
			  cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno));
			cf_info(AS_DEMARSHAL, "Service also listening on localhost socket %s:%d", ls->addr, ls->port);
		}
	}
	else {
		epoll_fd = epoll_create(EPOLL_SZ);
		if (epoll_fd == -1)
			cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno));
	}

	g_demarshal_args->epoll_fd[thr_id] = epoll_fd;
	cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id);

	int id_cntr = 0;

	// Demarshal transactions from the socket.
	for ( ; ; ) {
		struct epoll_event events[EPOLL_SZ];

		cf_detail(AS_DEMARSHAL, "calling epoll");

		nevents = epoll_wait(epoll_fd, events, EPOLL_SZ, -1);

		if (0 > nevents) {
			cf_debug(AS_DEMARSHAL, "epoll_wait() returned %d ; errno = %d (%s)", nevents, errno, cf_strerror(errno));
		}

		cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents);

		uint64_t now_ns = cf_getns();
		uint64_t now_ms = now_ns / 1000000;

		// Iterate over all events.
		for (i = 0; i < nevents; i++) {
			if ((s->sock == events[i].data.fd) || (ls->sock == events[i].data.fd)) {
				// Accept new connections on the service socket.
				int csocket = -1;
				struct sockaddr_in caddr;
				socklen_t clen = sizeof(caddr);
				char cpaddr[64];

				if (-1 == (csocket = accept(events[i].data.fd, (struct sockaddr *)&caddr, &clen))) {
					// This means we're out of file descriptors - could be a SYN
					// flood attack or misbehaving client. Eventually we'd like
					// to make the reaper fairer, but for now we'll just have to
					// ignore the accept error and move on.
					if ((errno == EMFILE) || (errno == ENFILE)) {
						if (last_fd_print != (cf_getms() / 1000L)) {
							cf_info(AS_DEMARSHAL, " warning: hit OS file descript limit (EMFILE on accept), consider raising limit");
							last_fd_print = cf_getms() / 1000L;
						}
						continue;
					}
					cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno);
				}

				// Get the client IP address in string form.
				if (caddr.sin_family == AF_INET) {
					if (NULL == inet_ntop(AF_INET, &caddr.sin_addr.s_addr, (char *)cpaddr, sizeof(cpaddr))) {
						cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno);
					}
				}
				else if (caddr.sin_family == AF_INET6) {
					struct sockaddr_in6* addr_in6 = (struct sockaddr_in6*)&caddr;

					if (NULL == inet_ntop(AF_INET6, &addr_in6->sin6_addr, (char *)cpaddr, sizeof(cpaddr))) {
						cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno);
					}
				}
				else {
					cf_crash(AS_DEMARSHAL, "unknown address family %u", caddr.sin_family);
				}

				cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", cpaddr, csocket);

				// Validate the limit of protocol connections we allow.
				uint32_t conns_open = g_config.proto_connections_opened - g_config.proto_connections_closed;
				if (conns_open > g_config.n_proto_fd_max) {
					if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs
						cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open);
						last_fd_print = cf_getms();
					}
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					continue;
				}

				// Set the socket to nonblocking.
				if (-1 == cf_socket_set_nonblocking(csocket)) {
					cf_info(AS_DEMARSHAL, "unable to set client socket to nonblocking mode");
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					continue;
				}

				// Create as_file_handle and queue it up in epoll_fd for further
				// communication on one of the demarshal threads.
				as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle));
				if (!fd_h) {
					cf_crash(AS_DEMARSHAL, "malloc");
				}

				sprintf(fd_h->client, "%s:%d", cpaddr, ntohs(caddr.sin_port));
				fd_h->fd = csocket;

				fd_h->last_used = cf_getms();
				fd_h->reap_me = false;
				fd_h->trans_active = false;
				fd_h->proto = 0;
				fd_h->proto_unread = 0;
				fd_h->fh_info = 0;
				fd_h->security_filter = as_security_filter_create();

				// Insert into the global table so the reaper can manage it. Do
				// this before queueing it up for demarshal threads - once
				// EPOLL_CTL_ADD is done it's difficult to back out (if insert
				// into global table fails) because fd state could be anything.
				cf_rc_reserve(fd_h);

				pthread_mutex_lock(&g_file_handle_a_LOCK);

				int j;
				bool inserted = true;

				if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) {
					inserted = false;
				}
				else {
					g_file_handle_a[j] = fd_h;
				}

				pthread_mutex_unlock(&g_file_handle_a_LOCK);

				if (!inserted) {
					cf_info(AS_DEMARSHAL, "unable to add socket to file handle table");
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					cf_rc_free(fd_h); // will free even with ref-count of 2
				}
				else {
					// Place the client socket in the event queue.
					memset(&ev, 0, sizeof(ev));
					ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP ;
					ev.data.ptr = fd_h;

					// Round-robin pick up demarshal thread epoll_fd and add
					// this new connection to epoll.
					int id;
					while (true) {
						id = (id_cntr++) % g_demarshal_args->num_threads;
						if (g_demarshal_args->epoll_fd[id] != 0) {
							break;
						}
					}

					fd_h->epoll_fd = g_demarshal_args->epoll_fd[id];

					if (0 > (n = epoll_ctl(fd_h->epoll_fd, EPOLL_CTL_ADD, csocket, &ev))) {
						cf_info(AS_DEMARSHAL, "unable to add socket to event queue of demarshal thread %d %d", id, g_demarshal_args->num_threads);
						pthread_mutex_lock(&g_file_handle_a_LOCK);
						fd_h->reap_me = true;
						as_release_file_handle(fd_h);
						fd_h = 0;
						pthread_mutex_unlock(&g_file_handle_a_LOCK);
					}
					else {
						cf_atomic_int_incr(&g_config.proto_connections_opened);
					}
				}
			}
			else {
				bool has_extra_ref   = false;
				as_file_handle *fd_h = events[i].data.ptr;
				if (fd_h == 0) {
					cf_info(AS_DEMARSHAL, "event with null handle, continuing");
					goto NextEvent;
				}

				cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", fd_h->fd, events[i].events);

				// Process data on an existing connection: this might be more
				// activity on an already existing transaction, so we have some
				// state to manage.
				as_proto *proto_p = 0;
				int fd = fd_h->fd;

				if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) {
					cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", fd, events[i].events);
					// No longer in use - take it out of epoll, etc.
					goto NextEvent_FD_Cleanup;
				}

				if (fd_h->trans_active) {
					goto NextEvent;
				}

				// If pointer is NULL, then we need to create a transaction and
				// store it in the buffer.
				if (fd_h->proto == NULL) {
					as_proto proto;
					int sz;

					// Get the number of available bytes.
					if (-1 == ioctl(fd, FIONREAD, &sz)) {
						cf_info(AS_DEMARSHAL, "unable to get number of available bytes");
						goto NextEvent_FD_Cleanup;
					}

					// If we don't have enough data to fill the message buffer,
					// just wait and we'll come back to this one. However, we'll
					// let messages with zero size through, since they are
					// likely errors. We don't cleanup the FD in this case since
					// we'll get more data on it.
					if (sz < sizeof(as_proto) && sz != 0) {
						goto NextEvent;
					}

					// Do a preliminary read of the header into a stack-
					// allocated structure, so that later on we can allocate the
					// entire message buffer.
					if (0 >= (n = cf_socket_recv(fd, &proto, sizeof(as_proto), MSG_WAITALL))) {
						cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d sz was %d errno %d", n, sz, errno);
						goto NextEvent_FD_Cleanup;
					}

					if (proto.version != PROTO_VERSION &&
							// For backward compatibility, allow version 0 with
							// security messages.
							! (proto.version == 0 && proto.type == PROTO_TYPE_SECURITY)) {
						cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u",
								fd_h->client, proto.version);
						goto NextEvent_FD_Cleanup;
					}

					// Swap the necessary elements of the as_proto.
					as_proto_swap(&proto);

					if (proto.sz > PROTO_SIZE_MAX) {
						cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %"PRIu64,
								fd_h->client, PROTO_SIZE_MAX, proto.sz);
						goto NextEvent_FD_Cleanup;
					}

#ifdef USE_JEM
					// Attempt to peek the namespace and set the JEMalloc arena accordingly.
					size_t peeked_data_sz = 0;
					size_t min_field_sz = sizeof(uint32_t) + sizeof(char);
					size_t min_as_msg_sz = sizeof(as_msg) + min_field_sz;
					size_t peekbuf_sz = 2048; // (Arbitrary "large enough" size for peeking the fields of "most" AS_MSGs.)
					uint8_t peekbuf[peekbuf_sz];
					if (PROTO_TYPE_AS_MSG == proto.type) {
						size_t offset = sizeof(as_msg);
						// Number of bytes to peek from the socket.
//						size_t peek_sz = peekbuf_sz;                 // Peek up to the size of the peek buffer.
						size_t peek_sz = MIN(proto.sz, peekbuf_sz);  // Peek only up to the minimum necessary number of bytes.
						if (!(peeked_data_sz = cf_socket_recv(fd, peekbuf, peek_sz, 0))) {
							// This is actually legitimate: the as_proto may have arrived in one
							// packet and the as_msg in the next, which we haven't received yet.
							// Before async clients, this effectively never happened.
							cf_detail(AS_DEMARSHAL, "could not peek the as_msg header, expected %zu byte(s)", peek_sz);
						}
						if (peeked_data_sz > min_as_msg_sz) {
//							cf_debug(AS_DEMARSHAL, "(Peeked %zu bytes.)", peeked_data_sz);
							if (peeked_data_sz > proto.sz) {
								cf_warning(AS_DEMARSHAL, "Received unexpected extra data from client %s socket %d when peeking as_proto!", fd_h->client, fd);
								log_as_proto_and_peeked_data(&proto, peekbuf, peeked_data_sz);
								goto NextEvent_FD_Cleanup;
							}

							if (((as_msg*)peekbuf)->info1 & AS_MSG_INFO1_BATCH) {
								jem_set_arena(orig_arena);
							} else {
								uint16_t n_fields = ntohs(((as_msg *) peekbuf)->n_fields), field_num = 0;
								bool found = false;
	//							cf_debug(AS_DEMARSHAL, "Found %d AS_MSG fields", n_fields);
								while (!found && (field_num < n_fields)) {
									as_msg_field *field = (as_msg_field *) (&peekbuf[offset]);
									uint32_t value_sz = ntohl(field->field_sz) - 1;
	//								cf_debug(AS_DEMARSHAL, "Field #%d offset: %lu", field_num, offset);
	//								cf_debug(AS_DEMARSHAL, "\tvalue_sz %u", value_sz);
	//								cf_debug(AS_DEMARSHAL, "\ttype %d", field->type);
									if (AS_MSG_FIELD_TYPE_NAMESPACE == field->type) {
										if (value_sz >= AS_ID_NAMESPACE_SZ) {
											cf_warning(AS_DEMARSHAL, "namespace too long (%u) in as_msg", value_sz);
											goto NextEvent_FD_Cleanup;
										}
										char ns[AS_ID_NAMESPACE_SZ];
										found = true;
										memcpy(ns, field->data, value_sz);
										ns[value_sz] = '\0';
	//									cf_debug(AS_DEMARSHAL, "Found ns \"%s\" in field #%d.", ns, field_num);
										jem_set_arena(as_namespace_get_jem_arena(ns));
									} else {
	//									cf_debug(AS_DEMARSHAL, "Message field %d is not namespace (type %d) ~~ Reading next field", field_num, field->type);
										field_num++;
										offset += sizeof(as_msg_field) + value_sz;
										if (offset >= peeked_data_sz) {
											break;
										}
									}
								}
								if (!found) {
									cf_warning(AS_DEMARSHAL, "Can't get namespace from AS_MSG (peeked %zu bytes) ~~ Using default thr_demarshal arena.", peeked_data_sz);
									jem_set_arena(orig_arena);
								}
							}
						} else {
							jem_set_arena(orig_arena);
						}
					} else {
						jem_set_arena(orig_arena);
					}
#endif

					// Allocate the complete message buffer.
					proto_p = cf_malloc(sizeof(as_proto) + proto.sz);

					cf_assert(proto_p, AS_DEMARSHAL, CF_CRITICAL, "allocation: %zu %s", (sizeof(as_proto) + proto.sz), cf_strerror(errno));
					memcpy(proto_p, &proto, sizeof(as_proto));

#ifdef USE_JEM
					// Jam in the peeked data.
					if (peeked_data_sz) {
						memcpy(proto_p->data, &peekbuf, peeked_data_sz);
					}
					fd_h->proto_unread = proto_p->sz - peeked_data_sz;
#else
					fd_h->proto_unread = proto_p->sz;
#endif
					fd_h->proto = (void *) proto_p;
				}
				else {
					proto_p = fd_h->proto;
				}

				if (fd_h->proto_unread > 0) {

					// Read the data.
					n = cf_socket_recv(fd, proto_p->data + (proto_p->sz - fd_h->proto_unread), fd_h->proto_unread, 0);
					if (0 >= n) {
						if (errno == EAGAIN) {
							continue;
						}
						cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", n, errno, cf_strerror(errno));
						goto NextEvent_FD_Cleanup;
					}

					// Decrement bytes-unread counter.
					cf_detail(AS_DEMARSHAL, "read fd %d (%d %d)", fd, n, fd_h->proto_unread);
					fd_h->proto_unread -= n;
				}

				// Check for a finished read.
				if (0 == fd_h->proto_unread) {

					// The connection only counts as live when it injects a transaction.
					fd_h->last_used = now_ms;

					thr_demarshal_pause(fd_h); // pause reading while the transaction is in progress
					fd_h->proto = 0;
					fd_h->proto_unread = 0;

					// INIT_TR
					as_transaction tr;
					as_transaction_init(&tr, NULL, (cl_msg *)proto_p);

					cf_rc_reserve(fd_h);
					has_extra_ref   = true;
					tr.proto_fd_h   = fd_h;
					tr.start_time   = now_ns; // set transaction start time
					tr.preprocessed = false;

					if (! as_proto_is_valid_type(proto_p)) {
						cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type);
						// We got a proto message type we don't recognize, so it
						// may not do any good to send back an as_msg error, but
						// it's the best we can do. At least we can keep the fd.
						as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					if (g_config.microbenchmarks) {
						histogram_insert_data_point(g_config.demarshal_hist, now_ns);
						tr.microbenchmark_time = cf_getns();
					}

					// Check if it's compressed.
					if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) {
						// Decompress it - allocate buffer to hold decompressed
						// packet.
						uint8_t *decompressed_buf = NULL;
						size_t decompressed_buf_size = 0;
						int rv = 0;
						if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) {
							cf_warning(AS_DEMARSHAL, "as_proto decompression failed! (rv %d)", rv);
							cf_warning_binary(AS_DEMARSHAL, proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p");
							as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
							cf_atomic_int_incr(&g_config.proto_transactions);
							goto NextEvent;
						}
						// Count the packets.
						cf_atomic_int_add(&g_config.stat_compressed_pkts_received, 1);
						// Free the compressed packet since we'll be using the
						// decompressed packet from now on.
						cf_free(proto_p);
						proto_p = NULL;
						// Get original packet.
						tr.msgp = (cl_msg *)decompressed_buf;
						as_proto_swap(&(tr.msgp->proto));

						if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) {
							cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]",
									tr.msgp->proto.version, tr.msgp->proto.type, tr.msgp->proto.sz, decompressed_buf_size);
							as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
							cf_atomic_int_incr(&g_config.proto_transactions);
							goto NextEvent;
						}
					}

					// Security protocol transactions.
					if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) {
						as_security_transact(&tr);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					// Info protocol requests.
					if (tr.msgp->proto.type == PROTO_TYPE_INFO) {
						if (as_info(&tr)) {
							cf_warning(AS_DEMARSHAL, "Info request failed to be enqueued ~~ Freeing protocol buffer");
							goto NextEvent_FD_Cleanup;
						}
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp);

					// Fast path for batch requests.
					if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) {
						as_batch_queue_task(&tr);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					// Either process the transaction directly in this thread,
					// or queue it for processing by another thread (tsvc/info).
					if (0 != thr_tsvc_process_or_enqueue(&tr)) {
						cf_warning(AS_DEMARSHAL, "Failed to queue transaction to the service thread");
						goto NextEvent_FD_Cleanup;
					}
					else {
						cf_atomic_int_incr(&g_config.proto_transactions);
					}
				}

				// Jump the proto message free & FD cleanup. If we get here, the
				// above operations went smoothly. The message free & FD cleanup
				// job is handled elsewhere as directed by
				// thr_tsvc_process_or_enqueue().
				goto NextEvent;

NextEvent_FD_Cleanup:
				// If we allocated memory for the incoming message, free it.
				if (proto_p) {
					cf_free(proto_p);
					fd_h->proto = 0;
				}
				// If fd has extra reference for transaction, release it.
				if (has_extra_ref) {
					cf_rc_release(fd_h);
				}
				// Remove the fd from the events list.
				if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, 0) < 0) {
					cf_crash(AS_DEMARSHAL, "unable to remove socket FD %d from epoll instance FD %d: %d (%s)",
							fd, epoll_fd, errno, cf_strerror(errno));
				}
				pthread_mutex_lock(&g_file_handle_a_LOCK);
				fd_h->reap_me = true;
				as_release_file_handle(fd_h);
				fd_h = 0;
				pthread_mutex_unlock(&g_file_handle_a_LOCK);
NextEvent:
				;
			}

			// We should never be canceled externally, but just in case...
			pthread_testcancel();
		}
	}

	return NULL;
}
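
The admission check above is simple counter arithmetic: connections currently open are those opened minus those closed, compared against the configured ceiling. Below is a minimal standalone sketch of that check; the names are illustrative stand-ins, not the server's actual g_config counters or cf_* logging.

/* Sketch only - hypothetical names, not the server's actual globals. */
#include <stdbool.h>
#include <stdint.h>
#include <sys/socket.h>
#include <unistd.h>

static bool
admit_connection(int csocket, uint64_t conns_opened, uint64_t conns_closed,
		uint32_t max_conns)
{
	uint64_t conns_open = conns_opened - conns_closed;

	if (conns_open > max_conns) {
		// Over the limit - refuse politely and let the client retry later.
		shutdown(csocket, SHUT_RDWR);
		close(csocket);
		return false;
	}

	return true;
}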
Example #27
as_status
as_command_execute(as_cluster* cluster, as_error* err, as_command_node* cn,
                   uint8_t* command, size_t command_len, uint32_t timeout_ms,
                   uint32_t retry, as_parse_results_fn parse_results_fn,
                   void* parse_results_data)
{
    uint64_t deadline_ms = as_socket_deadline(timeout_ms);
    uint32_t sleep_between_retries_ms = 0;
    uint32_t failed_nodes = 0;
    uint32_t failed_conns = 0;
    uint32_t iterations = 0;
    bool release_node;

    // Execute command until successful, timed out or maximum iterations have been reached.
    while (true) {
        as_node* node;

        if (cn->node) {
            node = cn->node;
            release_node = false;
        }
        else {
            node = as_node_get(cluster, cn->ns, cn->digest, cn->write, cn->replica);
            release_node = true;
        }

        if (!node) {
            failed_nodes++;
            sleep_between_retries_ms = 10;
            goto Retry;
        }

        int fd;
        as_status status = as_node_get_connection(err, node, deadline_ms, &fd);

        if (status) {
            if (release_node) {
                as_node_release(node);
            }
            failed_conns++;
            sleep_between_retries_ms = 1;
            goto Retry;
        }

        // Send command.
        status = as_socket_write_deadline(err, fd, command, command_len, deadline_ms);

        if (status) {
            // Socket errors are considered temporary anomalies.  Retry.
            // Close socket to flush out possible garbage.  Do not put back in pool.
            as_close(fd);
            if (release_node) {
                as_node_release(node);
            }
            sleep_between_retries_ms = 0;
            goto Retry;
        }

        // Parse results returned by server.
        status = parse_results_fn(err, fd, deadline_ms, parse_results_data);

        if (status == AEROSPIKE_OK) {
            // Reset error code if retry had occurred.
            if (iterations > 0) {
                as_error_reset(err);
            }
        }
        else {
            switch (status) {
            // Retry on timeout.
            case AEROSPIKE_ERR_TIMEOUT:
                as_close(fd);
                if (release_node) {
                    as_node_release(node);
                }
                sleep_between_retries_ms = 0;
                goto Retry;

            // Close socket on errors that can leave unread data in socket.
            case AEROSPIKE_ERR_QUERY_ABORTED:
            case AEROSPIKE_ERR_SCAN_ABORTED:
            case AEROSPIKE_ERR_CLIENT_ABORT:
            case AEROSPIKE_ERR_CLIENT:
                as_close(fd);
                if (release_node) {
                    as_node_release(node);
                }
                err->code = status;
                return status;

            default:
                err->code = status;
                break;
            }
        }

        // Put connection back in pool.
        as_node_put_connection(node, fd, cluster->conn_queue_size);

        // Release resources.
        if (release_node) {
            as_node_release(node);
        }
        return status;

Retry:
        // Check if max retries reached.
        if (++iterations > retry) {
            break;
        }

        // Check for client timeout.
        if (deadline_ms > 0) {
            int remaining_ms = (int)(deadline_ms - cf_getms() - sleep_between_retries_ms);

            if (remaining_ms <= 0) {
                break;
            }

            // Reset timeout in send buffer (destined for server).
            *(uint32_t*)(command + 22) = cf_swap_to_be32(remaining_ms);
        }

        if (sleep_between_retries_ms > 0) {
            // Sleep before trying again.
            usleep(sleep_between_retries_ms * 1000);
        }
    }

    return as_error_update(err, AEROSPIKE_ERR_TIMEOUT,
                           "Client timeout: timeout=%d iterations=%u failedNodes=%u failedConns=%u",
                           timeout_ms, iterations, failed_nodes, failed_conns);
}
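
The retry loop converts the caller's relative timeout into an absolute deadline once, then re-derives the remaining budget on every retry (subtracting the planned sleep) before stamping it back into the wire buffer. A sketch of that bookkeeping follows, assuming only a monotonic millisecond clock; now_ms() is an illustrative stand-in for cf_getms().

/* Sketch only - now_ms() stands in for cf_getms(). */
#include <stdint.h>
#include <time.h>

static uint64_t
now_ms(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

static int
remaining_ms(uint64_t deadline_ms, uint32_t next_sleep_ms)
{
    // Subtract the planned sleep up front so we never start an attempt
    // that cannot finish before the deadline.
    int64_t left = (int64_t)(deadline_ms - now_ms()) - (int64_t)next_sleep_ms;
    return left > 0 ? (int)left : 0;
}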
Example #28
// Put batch request on a separate batch queue.
int
as_batch(as_transaction* tr)
{
	as_msg* msg = &tr->msgp->msg;

	as_msg_field* nsfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_NAMESPACE);
	if (! nsfp) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return -1;
	}

	as_msg_field* dfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY);
	if (! dfp) {
		cf_warning(AS_BATCH, "Batch digests are required.");
		return -1;
	}

	uint n_digests = dfp->field_sz / sizeof(cf_digest);

	if (n_digests > g_config.batch_max_requests) {
		cf_warning(AS_BATCH, "Batch request size %u exceeds max %u.", n_digests, g_config.batch_max_requests);
		return -1;
	}

	batch_transaction btr;
	btr.trid = tr->trid;
	btr.end_time = tr->end_time;
	btr.get_data = !(msg->info1 & AS_MSG_INFO1_GET_NOBINDATA);

	btr.ns = as_namespace_get_bymsgfield(nsfp);
	if (! btr.ns) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return -1;
	}

	// Create the master digest table.
	btr.digests = (batch_digests*) cf_malloc(sizeof(batch_digests) + (sizeof(batch_digest) * n_digests));
	if (! btr.digests) {
		cf_warning(AS_BATCH, "Failed to allocate memory for batch digests.");
		return -1;
	}

	batch_digests* bmd = btr.digests;
	bmd->n_digests = n_digests;
	uint8_t* digest_field_data = dfp->data;

	for (int i = 0; i < n_digests; i++) {
		bmd->digest[i].done = false;
		bmd->digest[i].node = 0;
		memcpy(&bmd->digest[i].keyd, digest_field_data, sizeof(cf_digest));
		digest_field_data += sizeof(cf_digest);
	}

	btr.binlist = as_binlist_from_op(msg);
	btr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	btr.fd_h->last_used = cf_getms();

	cf_atomic_int_incr(&g_config.batch_initiate);
	cf_queue_push(g_batch_queue, &btr);
	return 0;
}
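
The digest field's payload is nothing more than n contiguous fixed-size digests, which is why the count falls out of a single division. A sketch of that unpacking under simplified stand-in types; digest here plays the role of cf_digest, and the 20-byte size reflects the RIPEMD-160 digests Aerospike uses.

/* Sketch only - simplified stand-ins for cf_digest and the field payload. */
#include <stdint.h>
#include <string.h>

#define DIGEST_SZ 20 // RIPEMD-160 digest size, as used by cf_digest

typedef struct { uint8_t d[DIGEST_SZ]; } digest;

static uint32_t
unpack_digests(const uint8_t* field_data, uint32_t field_sz, digest* out,
		uint32_t max_out)
{
	uint32_t n = field_sz / DIGEST_SZ;

	if (n > max_out) {
		n = max_out; // the caller's buffer is the hard cap
	}

	for (uint32_t i = 0; i < n; i++) {
		memcpy(out[i].d, field_data + (size_t)i * DIGEST_SZ, DIGEST_SZ);
	}

	return n;
}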
Example #29
/* cf_socket_init_client
 * Connect a socket to a remote endpoint.
 * DOES A BLOCKING CONNECT INLINE - respects the passed-in timeout.
 */
int
cf_socket_init_client(cf_socket_cfg *s, int timeout)
{
    cf_assert(s, CF_SOCKET, CF_CRITICAL, "invalid argument");

    if (0 > (s->sock = socket(AF_INET, s->proto, 0))) {
        cf_warning(CF_SOCKET, "socket: %s", cf_strerror(errno));
        return(-1);
    }

    fcntl(s->sock, F_SETFD, FD_CLOEXEC);  /* close on exec */
    fcntl(s->sock, F_SETFL, O_NONBLOCK); /* non-blocking */

    // Try tuning the window: must be done before connect
//	int flag = (1024 * 32);
//	setsockopt(s->sock, SOL_SOCKET, SO_SNDBUF, &flag, sizeof(flag) );
//	setsockopt(s->sock, SOL_SOCKET, SO_RCVBUF, &flag, sizeof(flag) );

    memset(&s->saddr,0,sizeof(s->saddr));
    s->saddr.sin_family = AF_INET;
    int rv = inet_pton(AF_INET, s->addr, &s->saddr.sin_addr.s_addr);
    if (rv < 0) {
        cf_warning(CF_SOCKET, "inet_pton: %s", cf_strerror(errno));
        close(s->sock);
        return(-1);
    } else if (rv == 0) {
        cf_warning(CF_SOCKET, "inet_pton: invalid ip %s", s->addr);
        close(s->sock);
        return(-1);
    }
    s->saddr.sin_port = htons(s->port);

    rv = connect(s->sock, (struct sockaddr *)&s->saddr, sizeof(s->saddr));
    cf_debug(CF_SOCKET, "connect: rv %d errno %s",rv,cf_strerror(errno));

    if (rv < 0) {
        int epoll_fd = -1;

        if (errno == EINPROGRESS) {
            cf_clock start = cf_getms();

            if (0 > (epoll_fd = epoll_create(1))) {
                cf_warning(CF_SOCKET, "epoll_create() failed (errno %d: \"%s\")", errno, cf_strerror(errno));
                goto Fail;
            }

            struct epoll_event event;
            memset(&event, 0, sizeof(struct epoll_event));
            event.data.fd = s->sock;
            event.events = EPOLLOUT;

            if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &event)) {
                cf_warning(CF_SOCKET, "epoll_ctl(ADD) of client socket failed (errno %d: \"%s\")", errno, cf_strerror(errno));
                goto Fail;
            }

            int tries = 0;
            do {
                int nevents = 0;
                int max_events = 1;
                int wait_ms = 1;
                struct epoll_event events[max_events];

                if (0 > (nevents = epoll_wait(epoll_fd, events, max_events, wait_ms))) {
                    if (errno == EINTR) {
                        cf_debug(CF_SOCKET, "epoll_wait() on client socket encountered EINTR ~~ Retrying!");
                        goto Retry;
                    } else {
                        cf_warning(CF_SOCKET, "epoll_wait() on client socket failed (errno %d: \"%s\") ~~ Failing!", errno, cf_strerror(errno));
                        goto Fail;
                    }
                } else {
                    if (nevents == 0) {
                        cf_debug(CF_SOCKET, "epoll_wait() returned no events ~~ Retrying!");
                        goto Retry;
                    }
                    if (nevents != 1) {
                        cf_warning(CF_SOCKET, "epoll_wait() returned %d events ~~ only 1 expected, so ignoring others!", nevents);
                    }
                    if (events[0].data.fd == s->sock) {
                        if (events[0].events & EPOLLOUT) {
                            cf_debug(CF_SOCKET, "epoll_wait() on client socket ready for write detected ~~ Succeeding!");
                        } else {
                            // (Note:  ERR and HUP events are automatically waited for as well.)
                            if (events[0].events & (EPOLLERR | EPOLLHUP)) {
                                cf_debug(CF_SOCKET, "epoll_wait() on client socket detected failure event 0x%x ~~ Failing!", events[0].events);
                            } else {
                                cf_warning(CF_SOCKET, "epoll_wait() on client socket detected non-write events 0x%x ~~ Failing!", events[0].events);
                            }
                            goto Fail;
                        }
                    } else {
                        cf_warning(CF_SOCKET, "epoll_wait() on client socket returned event on unknown socket %d ~~ Retrying!", events[0].data.fd);
                        goto Retry;
                    }
                    if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_DEL, s->sock, &event)) {
                        cf_warning(CF_SOCKET, "epoll_ctl(DEL) on client socket failed (errno %d: \"%s\")", errno, cf_strerror(errno));
                    }
                    close(epoll_fd);
                    goto Success;
                }
Retry:
                cf_debug(CF_SOCKET, "Connect epoll loop:  Retry #%d", tries++);
                if (start + timeout < cf_getms()) {
                    cf_warning(CF_SOCKET, "Error in delayed connect() to %s:%d: timed out", s->addr, s->port);
                    errno = ETIMEDOUT;
                    goto Fail;
                }
            } while (1);
        }
Fail:
        cf_debug(CF_SOCKET, "connect failed to %s:%d : %s", s->addr, s->port, cf_strerror(errno));

        if (epoll_fd > 0) {
            close(epoll_fd);
        }

        close(s->sock);
        s->sock = -1;
        return(-1);
    } else {
        cf_debug(CF_SOCKET, "client socket connect() to %s:%d in 1 try!", s->addr, s->port);
    }
Success:
    ;
    // Note: setting TCP_NODELAY here doesn't seem terribly effective; on the
    // fabric threads, it appears important to set no-delay much later.
    int flag = 1;
    setsockopt(s->sock, SOL_TCP, TCP_NODELAY, &flag, sizeof(flag));
    long farg = fcntl(s->sock, F_GETFL, 0);
    fcntl(s->sock, F_SETFL, farg & (~O_NONBLOCK)); /* blocking again */

    return(0);
}
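Creating a throwaway epoll instance to wait on a single connecting socket is heavyweight; poll() expresses the same wait in a few lines. The sketch below shows that alternative pattern (it is not the code above), including the customary SO_ERROR check once the socket reports writable.

/* Sketch only - an alternative wait using poll() instead of epoll. */
#include <errno.h>
#include <poll.h>
#include <sys/socket.h>

static int
wait_for_connect(int sock, int timeout_ms)
{
    struct pollfd pfd = { .fd = sock, .events = POLLOUT };
    int rv;

    do {
        // (For brevity this restarts with the full timeout after EINTR;
        // production code would recompute the remaining time.)
        rv = poll(&pfd, 1, timeout_ms);
    } while (rv < 0 && errno == EINTR);

    if (rv <= 0) {
        return -1; // rv == 0 is a timeout, rv < 0 a poll failure
    }

    // Writability alone doesn't mean success - the connect may have
    // completed with an error, which SO_ERROR reports.
    int err = 0;
    socklen_t len = sizeof(err);

    if (getsockopt(sock, SOL_SOCKET, SO_ERROR, &err, &len) < 0 || err != 0) {
        return -1;
    }

    return 0;
}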
// Keep track of the connections, since they're precious. Kill anything that
// hasn't been used in a while. The file handle array keeps a reference count,
// and allows a reaper to run through and find the ones to reap. The table is
// only written by the demarshal threads, and only read by the reaper thread.
void *
thr_demarshal_reaper_fn(void *arg)
{
	uint64_t last = cf_getms();

	while (true) {
		uint64_t now = cf_getms();
		uint inuse_cnt = 0;
		uint64_t kill_ms = g_config.proto_fd_idle_ms;
		bool refresh = false;

		if (now - last > (uint64_t)(g_config.sec_cfg.privilege_refresh_period * 1000)) {
			refresh = true;
			last = now;
		}

		pthread_mutex_lock(&g_file_handle_a_LOCK);

		for (int i = 0; i < g_file_handle_a_sz; i++) {
			if (g_file_handle_a[i]) {
				as_file_handle *fd_h = g_file_handle_a[i];

				if (refresh) {
					as_security_refresh(fd_h);
				}

				// Reap, if asked to.
				if (fd_h->reap_me) {
					cf_debug(AS_DEMARSHAL, "Reaping FD %d as requested", fd_h->fd);
					g_file_handle_a[i] = 0;
					cf_queue_push(g_freeslot, &i);
					as_release_file_handle(fd_h);
					fd_h = 0;
				}
				// Reap if past kill time.
				else if ((0 != kill_ms) && (fd_h->last_used + kill_ms < now)) {
					if (fd_h->fh_info & FH_INFO_DONOT_REAP) {
						cf_debug(AS_DEMARSHAL, "Not reaping the fd %d as it has the protection bit set", fd_h->fd);
						inuse_cnt++;
						continue;
					}

					shutdown(fd_h->fd, SHUT_RDWR); // will trigger epoll errors
					cf_debug(AS_DEMARSHAL, "remove unused connection, fd %d", fd_h->fd);
					g_file_handle_a[i] = 0;
					cf_queue_push(g_freeslot, &i);
					as_release_file_handle(fd_h);
					fd_h = 0;
					cf_atomic_int_incr(&g_config.reaper_count);
				}
				else {
					inuse_cnt++;
				}
			}
		}

		pthread_mutex_unlock(&g_file_handle_a_LOCK);

		if ((g_file_handle_a_sz / 10) > (g_file_handle_a_sz - inuse_cnt)) {
			cf_warning(AS_DEMARSHAL, "less than ten percent file handles remaining: %d max %d inuse",
					g_file_handle_a_sz, inuse_cnt);
		}

		// Validate the system statistics.
		if (g_config.proto_connections_opened - g_config.proto_connections_closed != inuse_cnt) {
			cf_debug(AS_DEMARSHAL, "reaper: mismatched connection count: %d in stats vs %d calculated",
					g_config.proto_connections_opened - g_config.proto_connections_closed,
					inuse_cnt);
		}

		sleep(1);
	}

	return NULL;
}
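
The reaper's per-handle decision reduces to three rules: honor an explicit reap_me flag, never touch a protected handle, otherwise kill anything idle past the configured window. A sketch of that decision as a pure function, with simplified stand-ins for the as_file_handle fields (FH_NO_REAP plays the role of FH_INFO_DONOT_REAP):

/* Sketch only - simplified stand-ins for as_file_handle and its flags. */
#include <stdbool.h>
#include <stdint.h>

enum { FH_NO_REAP = 1 << 0 };

typedef struct {
	bool     reap_me;   // explicitly marked for reaping
	uint32_t fh_info;   // protection bits
	uint64_t last_used; // ms timestamp of last activity
} handle;

static bool
should_reap(const handle* h, uint64_t now_ms, uint64_t idle_kill_ms)
{
	if (h->reap_me) {
		return true; // always honor an explicit request
	}

	if (idle_kill_ms == 0 || (h->fh_info & FH_NO_REAP) != 0) {
		return false; // idle reaping disabled, or the handle is protected
	}

	return h->last_used + idle_kill_ms < now_ms;
}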