Example #1
/**
 * Initialize a udf_call from tr->udata. Used for internal UDF (scan and
 * query) transactions.
 *
 * Returns:
 * 		0 if found
 * 		-1 if not found
 */
int
udf_rw_call_init_internal(udf_call * call, as_transaction * tr)
{
	udf_call *ucall = NULL;
	if (tr->udata.req_type == UDF_SCAN_REQUEST) {
		ucall = as_scan_get_udf_call(tr->udata.req_udata);
	} else if (tr->udata.req_type == UDF_QUERY_REQUEST) {
		ucall = as_query_get_udf_call(tr->udata.req_udata);
	}

	if (ucall) {
		strncpy (call->def.filename, ucall->def.filename, sizeof(ucall->def.filename));
		strncpy (call->def.function, ucall->def.function, sizeof(ucall->def.function));
		call->tr          = tr;
		call->def.arglist = ucall->def.arglist;
		call->def.type    = ucall->def.type;
		if (tr->udata.req_type == UDF_SCAN_REQUEST) {
			cf_atomic_int_incr(&g_config.udf_scan_rec_reqs);
		} else if (tr->udata.req_type == UDF_QUERY_REQUEST) {
			cf_atomic_int_incr(&g_config.udf_query_rec_reqs);
		}
		return 0;
	} 
	return -1;
}
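A minimal caller sketch, not taken from the source, showing how the 0/-1 contract above might be consumed; the actual UDF dispatch is outside the scope of this sketch:

	udf_call call;

	if (udf_rw_call_init_internal(&call, tr) == 0) {
		// Internal scan/query UDF transaction - filename, function, arglist
		// and the stat counter have been set up; go on and execute the UDF.
	}
	else {
		// tr->udata did not describe a scan or query UDF request.
	}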
Example #2
// If there's an element with specified digest in the tree, return a locked
// and reserved reference to it in index_ref.
//
// Returns:
//		 0 - found (reference returned in index_ref)
//		-1 - not found (index_ref untouched)
int
as_index_get_vlock(as_index_tree *tree, cf_digest *keyd,
		as_index_ref *index_ref)
{
	pthread_mutex_lock(&tree->lock);

	int rv = as_index_search_lockless(tree, keyd, &index_ref->r,
			&index_ref->r_h);

	if (rv != 0) {
		pthread_mutex_unlock(&tree->lock);
		return rv;
	}

	as_index_reserve(index_ref->r);
	cf_atomic_int_incr(&g_config.global_record_ref_count);

	pthread_mutex_unlock(&tree->lock);

	if (! index_ref->skip_lock) {
		olock_vlock(g_config.record_locks, keyd, &index_ref->olock);
		cf_atomic_int_incr(&g_config.global_record_lock_count);
	}

	return 0;
}
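A sketch of the lookup pattern described in the comment above, assuming the caller releases via as_record_done() as the other examples on this page do; this is not from the source:

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	if (as_index_get_vlock(tree, &keyd, &r_ref) == 0) {
		as_index *r = r_ref.r; // reserved and record-locked - safe to use

		// ... read or update the record ...

		as_record_done(&r_ref, ns); // drops the olock and the reservation
	}
	// else: not found - r_ref was left untouched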
Example #3
void
as_transaction_error(as_transaction* tr, uint32_t error_code)
{
	if (tr->proto_fd_h) {
		if (tr->batch_shared) {
			as_batch_add_error(tr->batch_shared, tr->batch_index, error_code);
			// Clear this transaction's msgp so calling code does not free it.
			tr->msgp = 0;
		}
		else {
			as_msg_send_reply(tr->proto_fd_h, error_code, 0, 0, NULL, NULL, 0, NULL, NULL, as_transaction_trid(tr), NULL);
			tr->proto_fd_h = 0;
			MICROBENCHMARK_HIST_INSERT_P(error_hist);
			cf_atomic_int_incr(&g_config.err_tsvc_requests);
			if (error_code == AS_PROTO_RESULT_FAIL_TIMEOUT) {
				cf_atomic_int_incr(&g_config.err_tsvc_requests_timeout);
			}
		}
	}
	else if (tr->proxy_msg) {
		as_proxy_send_response(tr->proxy_node, tr->proxy_msg, error_code, 0, 0, NULL, NULL, 0, NULL, as_transaction_trid(tr), NULL);
		tr->proxy_msg = NULL;
	}
	else if (tr->udata.req_udata) {
		if (udf_rw_needcomplete(tr)) {
			udf_rw_complete(tr, error_code, __FILE__,__LINE__);
		}
	}
}
Example #4
void as_proxy_set_stat_counters(int rv) {
	if (rv == 0) {
		cf_atomic_int_incr(&g_config.stat_proxy_success);
	}
	else {
		cf_atomic_int_incr(&g_config.stat_proxy_errs);
	}
}
Example #5
/* Internal Function: Workhorse function to send the response back to the
 * 					  client after UDF execution.
 *
 * Callers:
 * 		send_success
 * 		send_failure
 *
 * Assumption: The call must be set up properly, pointing to the tr.
 *
 * Special Handling: If it is a background scan UDF job, do not send any
 * 					 response to the client.
 * 					 If it is a scan UDF job, do not clean up the fd - that
 * 					 is done by the scan thread after the scan finishes.
 */
static int
send_response(udf_call *call, const char *key, size_t klen, int vtype, void *val,
			  size_t vlen)
{
	as_transaction *    tr          = call->transaction;
	as_namespace *      ns          = tr->rsv.ns;
	uint32_t            generation  = tr->generation;
	uint                sp_sz       = 1024 * 16;
	uint32_t            void_time   = 0;
	uint                written_sz  = 0;
	bool                keep_fd     = false;
	as_bin              stack_bin;
	as_bin            * bin         = &stack_bin;

	// space for the stack particles
	uint8_t             stack_particle_buf[sp_sz];
	uint8_t *           sp_p        = stack_particle_buf;

	if (call->udf_type == AS_SCAN_UDF_OP_BACKGROUND) {
		// If we are doing a background UDF scan, do not send any result back
		cf_detail(AS_UDF, "UDF: Background transaction, send no result back. "
				  "Parent job id [%"PRIu64"]", ((tscan_job*)(tr->udata.req_udata))->tid);
		if(strncmp(key, "FAILURE", 8) == 0)  {
			cf_atomic_int_incr(&((tscan_job*)(tr->udata.req_udata))->n_obj_udf_failed);
		} else if(strncmp(key, "SUCCESS", 8) == 0) {
			cf_atomic_int_incr(&((tscan_job*)(tr->udata.req_udata))->n_obj_udf_success);
		}
		return 0;
	} else if(call->udf_type == AS_SCAN_UDF_OP_UDF) {
		// Do not release fd now, scan will do it at the end of all internal
		// 	udf transactions
		cf_detail(AS_UDF, "UDF: Internal udf transaction, do not release fd");
		keep_fd = true;
	}

	if (0 != make_send_bin(ns, bin, &sp_p, sp_sz, key, klen, vtype, val, vlen)) {
		return(-1);
	}

	// single_transaction_response() below will release the file descriptor,
	// so reserve it first if we need to keep it.
	if (keep_fd && tr->proto_fd_h) cf_rc_reserve(tr->proto_fd_h);

	single_transaction_response(
		tr, ns, NULL/*ops*/, &bin, 1,
		generation, void_time, &written_sz, NULL);

	// clean up.
	// TODO: check: is bin_inuse valid only when data_in_memory?
	// There must be another way to determine if the particle is used?
	if ( as_bin_inuse(bin) ) {
		as_particle_destroy(&stack_bin, ns->storage_data_in_memory);
	}

	if (sp_p != stack_particle_buf) {
		cf_free(sp_p);
	}
	return 0;
} // end send_response()
Example #6
void
as_index_reduce_traverse(as_index_tree *tree, cf_arenax_handle r_h,
		cf_arenax_handle sentinel_h, as_index_ph_array *v_a)
{
	if (v_a->pos >= v_a->alloc_sz) {
		return;
	}

	as_index *r = RESOLVE_H(r_h);

	if (r->left_h != sentinel_h) {
		as_index_reduce_traverse(tree, r->left_h, sentinel_h, v_a);
	}

	as_index_reserve(r);
	cf_atomic_int_incr(&g_config.global_record_ref_count);

	v_a->indexes[v_a->pos].r = r;
	v_a->indexes[v_a->pos].r_h = r_h;
	v_a->pos++;

	if (r->right_h != sentinel_h) {
		as_index_reduce_traverse(tree, r->right_h, sentinel_h, v_a);
	}
}
Example #7
// Process one queue's batch requests.
void*
batch_process_queue(void* q_to_wait_on)
{
	cf_queue* worker_queue = (cf_queue*)q_to_wait_on;
	batch_transaction btr;
	uint64_t start;

	while (1) {
		if (cf_queue_pop(worker_queue, &btr, CF_QUEUE_FOREVER) != 0) {
			cf_crash(AS_BATCH, "Failed to pop from batch worker queue.");
		}

		// Check for timeouts.
		if (btr.end_time != 0 && cf_getns() > btr.end_time) {
			cf_atomic_int_incr(&g_config.batch_timeout);

			if (btr.fd_h) {
				as_msg_send_reply(btr.fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT,
						0, 0, 0, 0, 0, 0, 0, btr.trid, NULL);
				btr.fd_h = 0;
			}
			batch_transaction_done(&btr);
			continue;
		}

		// Process batch request.
		start = cf_getns();
		batch_process_request(&btr);
		histogram_insert_data_point(g_config.batch_q_process_hist, start);
	}

	return 0;
}
Example #8
// TODO - deprecate this when swap is moved out into thr_demarshal.c!
void
as_transaction_error_unswapped(as_transaction* tr, uint32_t error_code)
{
	if (tr->batch_shared) {
		as_batch_add_error(tr->batch_shared, tr->batch_index, error_code);
		// Clear this transaction's msgp so calling code does not free it.
		tr->msgp = 0;
	}
	else {
		as_msg_send_reply(tr->proto_fd_h, error_code, 0, 0, NULL, NULL, 0, NULL, NULL, 0, NULL);
		tr->proto_fd_h = 0;
		MICROBENCHMARK_HIST_INSERT_P(error_hist);
		cf_atomic_int_incr(&g_config.err_tsvc_requests);
		if (error_code == AS_PROTO_RESULT_FAIL_TIMEOUT) {
			cf_atomic_int_incr(&g_config.err_tsvc_requests_timeout);
		}
	}
}
Example #9
/**
 * Get UDF call object pointer from parent job via tr->from.iudf_orig.
 */
udf_call *
udf_rw_call_def_init_internal(udf_call * call, as_transaction * tr)
{
	// TODO - wouldn't need this if we bailed early on losing race vs. timeout.
	if (! tr->from.iudf_orig) {
		return NULL; // can happen on timeout
	}

	call->def = &tr->from.iudf_orig->def;

	if (tr->from.iudf_orig->type == UDF_SCAN_REQUEST) {
		cf_atomic_int_incr(&g_config.udf_scan_rec_reqs);
	}
	else if (tr->from.iudf_orig->type == UDF_QUERY_REQUEST) {
		cf_atomic_int_incr(&g_config.udf_query_rec_reqs);
	}

	return call;
}
Example #10
/*
 * Create a tree "stub" for the case where the storage engine has its own
 * index (KV storage).
 * Returns:  1 = new
 *           0 = success (found)
 *          -1 = fail
 */
int
as_index_ref_initialize(as_index_tree *tree, cf_digest *key, as_index_ref *index_ref, bool create_p, as_namespace *ns)
{
	/* Allocate memory for the new node and set the node parameters */
	cf_arenax_handle n_h = cf_arenax_alloc(tree->arena);
	if (0 == n_h) {
		// cf_debug(AS_INDEX," malloc failed ");
		return(-1);
	}
	as_index *n = RESOLVE_H(n_h);
	n->key = *key;
	n->rc = 1;
	n->left_h = n->right_h = tree->sentinel_h;
	n->color = AS_RED;
	n->parent_h = tree->sentinel_h;

	if (AS_STORAGE_ENGINE_KV == ns->storage_type)
		n->storage_key.kv.file_id = STORAGE_INVALID_FILE_ID; // careful here - this is now unsigned
	else
		cf_crash(AS_INDEX, "non-KV storage type ns %s key %p", ns->name, key);

	index_ref->r = n;
	index_ref->r_h = n_h;
	if (!index_ref->skip_lock) {
		olock_vlock(g_config.record_locks, key, &(index_ref->olock));
		cf_atomic_int_incr(&g_config.global_record_lock_count);
	}
	as_index_reserve(n);
	cf_atomic_int_add(&g_config.global_record_ref_count, 2);

	int rv = !as_storage_record_exists(ns, key);

	// Unlock if not found and we're not creating it.
	if (rv && !create_p) {
		if (!index_ref->skip_lock) {
			pthread_mutex_unlock(index_ref->olock);
			cf_atomic_int_decr(&g_config.global_record_lock_count);
		}
		as_index_release(n);
		cf_atomic_int_decr(&g_config.global_record_ref_count);
		cf_arenax_free(tree->arena, n_h);
		index_ref->r = 0;
		index_ref->r_h = 0;
	}

	return(rv);
}
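A hedged caller sketch for the 1/0/-1 contract above, with create_p set so the reference is kept in both the "new" and "found" cases; as_record_done() is assumed to unlock and release as in the other examples:

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	int rv = as_index_ref_initialize(tree, &keyd, &r_ref, true, ns);

	if (rv < 0) {
		return -1; // arena allocation failed - nothing reserved, nothing locked
	}

	// rv == 1: new stub created; rv == 0: the record exists in storage.
	// Either way r_ref is reserved and (unless skip_lock) locked.

	// ... use the record ...

	as_record_done(&r_ref, ns);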
Example #11
// TODO - really? we can't hide this behind an XDR stub?
bool
xdr_allows_write(as_transaction* tr)
{
	if (as_transaction_is_xdr(tr)) {
		if (tr->rsv.ns->ns_allow_xdr_writes) {
			return true;
		}
	}
	else {
		if (tr->rsv.ns->ns_allow_nonxdr_writes) {
			return true;
		}
	}

	cf_atomic_int_incr(&tr->rsv.ns->n_fail_xdr_forbidden);

	return false;
}
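A hypothetical write-path guard, not from the source, showing how this check could combine with as_transaction_error() from Example #3; AS_PROTO_RESULT_FAIL_FORBIDDEN is assumed to be the appropriate result code:

	if (! xdr_allows_write(tr)) {
		// n_fail_xdr_forbidden was already incremented inside the check.
		as_transaction_error(tr, AS_PROTO_RESULT_FAIL_FORBIDDEN);
		return;
	}

	// ... proceed with the write ...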
Example #12
File: act.c Project: aanguss/act
//------------------------------------------------
// Runs in thr_add_readreqs, adds readreq objects
// to all read queues in an even, random spread.
//
static void* run_add_readreqs(void* pv_unused) {
	uint64_t count = 0;

	while (g_running) {
		if (cf_atomic_int_incr(&g_read_reqs_queued) > MAX_READ_REQS_QUEUED) {
			fprintf(stdout, "ERROR: too many read reqs queued\n");
			fprintf(stdout, "drive(s) can't keep up - test stopped\n");
			g_running = false;
			break;
		}

		uint32_t random_queue_index = rand_32() % g_num_queues;
		uint32_t random_device_index =
			g_queue_per_device ? random_queue_index : rand_32() % g_num_devices;

		device* p_random_device = &g_devices[random_device_index];
		readreq* p_readreq = malloc(sizeof(readreq));

		p_readreq->p_device = p_random_device;
		p_readreq->offset = random_read_offset(p_random_device);
		p_readreq->size = g_read_req_num_512_blocks * MIN_BLOCK_BYTES;
		p_readreq->start_time = cf_getus();

		cf_queue_push(g_readqs[random_queue_index].p_req_queue, &p_readreq);

		count++;

		int sleep_us = (int)
			(((count * 1000000) / g_read_reqs_per_sec) -
				(cf_getus() - g_run_start_us));

		if (sleep_us > 0) {
			usleep((uint32_t)sleep_us);
		}

		if (sleep_us != 0) {
			fprintf(stdout, "%" PRIu64 ", sleep_us = %d\n", count, sleep_us);
		}
	}

	return (0);
}
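The sleep computation above paces the generator at g_read_reqs_per_sec by comparing the target elapsed time for the count-th request with the actual elapsed time. A worked example with hypothetical numbers, not from the source:

	// Suppose g_read_reqs_per_sec = 2000 and count = 1000.
	// Target elapsed time: (1000 * 1000000) / 2000 = 500000 us since start.
	// If only 420000 us have actually passed, sleep_us = 500000 - 420000 =
	// 80000, so the thread sleeps 80 ms. If the drives have fallen behind,
	// the difference is negative and no sleep happens.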
Example #13
// Process one queue's batch requests.
static void
batch_worker(void* udata)
{
	batch_transaction* btr = (batch_transaction*)udata;
	
	// Check for timeouts.
	if (btr->end_time != 0 && cf_getns() > btr->end_time) {
		cf_atomic_int_incr(&g_config.batch_timeout);

		if (btr->fd_h) {
			as_msg_send_reply(btr->fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT,
					0, 0, 0, 0, 0, 0, 0, btr->trid, NULL);
			btr->fd_h = 0;
		}
		batch_transaction_done(btr, false);
		return;
	}
	
	// Process batch request.
	uint64_t start = cf_getns();
	batch_process_request(btr);
	histogram_insert_data_point(g_config.batch_q_process_hist, start);	
}
Example #14
// Helper to release transaction file handles.
void
as_release_file_handle(as_file_handle *proto_fd_h)
{
	int rc = cf_rc_release(proto_fd_h);

	if (rc > 0) {
		return;
	}
	else if (rc < 0) {
		cf_warning(AS_PROTO, "release file handle: negative ref-count %d", rc);
		return;
	}

	close(proto_fd_h->fd);
	proto_fd_h->fh_info &= ~FH_INFO_DONOT_REAP;
	proto_fd_h->fd = -1;

	if (proto_fd_h->proto)	{
		as_proto *p = proto_fd_h->proto;

		if ((p->version != PROTO_VERSION) || (p->type >= PROTO_TYPE_MAX)) {
			cf_warning(AS_PROTO, "release file handle: bad proto buf, corruption");
		}
		else {
			cf_free(proto_fd_h->proto);
			proto_fd_h->proto = NULL;
		}
	}

	if (proto_fd_h->security_filter) {
		as_security_filter_destroy(proto_fd_h->security_filter);
		proto_fd_h->security_filter = NULL;
	}

	cf_rc_free(proto_fd_h);
	cf_atomic_int_incr(&g_config.proto_connections_closed);
}
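A sketch of the reserve/release pairing this helper expects, mirroring what the demarshal code in Example #21 does when it hands a file handle to a transaction (hypothetical caller, not from the source):

	cf_rc_reserve(fd_h);      // extra reference for the in-flight transaction
	tr.proto_fd_h = fd_h;

	// ... later, when the transaction is finished with the handle ...

	as_release_file_handle(fd_h); // closes the fd only when the count reaches 0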
Example #15
static void* generate_async_reads(void* aio_context)
{
	uint64_t count = 0;
	while(g_running)
	{
		/* Create the struct of info needed at the process_read end */
		uintptr_t info_ptr;
		if (cf_queue_pop(async_info_queue, (void*)&info_ptr, CF_QUEUE_NOWAIT) !=
				CF_QUEUE_OK) 
		{
			fprintf(stdout, "Error: Could not pop info struct \n");
			return (void*)(-1);
		}
		as_async_info_t *info = (as_async_info_t*)info_ptr;
		memset(info, 0, sizeof(as_async_info_t));
		/* Generate the actual read request */
		uint32_t random_device_index = rand_32() % g_num_devices;
		device* p_random_device = &g_devices[random_device_index];
		readreq* p_readreq = &(info->p_readreq);
		if(p_readreq == NULL)
		{
			fprintf(stdout, "Error: preadreq null \n");
			goto fail;
		}
		p_readreq->p_device = p_random_device;
		p_readreq->offset = random_read_offset(p_random_device);
		p_readreq->size = g_read_req_num_512_blocks * MIN_BLOCK_BYTES;
		p_readreq->start_time = cf_getms();

		/* Async read */
		if (g_use_valloc) 
		{
			uint8_t* p_buffer = cf_valloc(p_readreq->size);
			info->p_buffer = p_buffer;
			if (p_buffer) 
			{
				uint64_t raw_start_time = cf_getms();
				info->raw_start_time = raw_start_time;
				if(read_async_from_device(info, *(aio_context_t *)aio_context) < 0)
				{
					fprintf(stdout, "Error: Async read failed \n");
					free(p_buffer);
					goto fail; 
				}
			}
			else 
			{
				fprintf(stdout, "ERROR: read buffer cf_valloc()\n");
			}
		}
		else 
		{
			uint8_t stack_buffer[p_readreq->size + 4096];
			uint8_t* p_buffer = align_4096(stack_buffer);
			info->p_buffer = p_buffer;
			uint64_t raw_start_time = cf_getms();
			info->raw_start_time = raw_start_time;
			if(read_async_from_device(info, *(aio_context_t*)aio_context) < 0)
			{
				fprintf(stdout, "Error: Async read failed \n");
				goto fail;
			}
		}
		if (cf_atomic_int_incr(&g_read_reqs_queued) > MAX_READ_REQS_QUEUED) 
		{
		  fprintf(stdout, "ERROR: too many read reqs queued\n");
		  fprintf(stdout, "drive(s) can't keep up - test stopped\n");
		  g_running = false;
		  return (void*)-1;
		}

		count++;

		int sleep_ms = (int)
			(((count * 1000) / g_read_reqs_per_sec) -
				(cf_getms() - g_run_start_ms));

		if (sleep_ms > 0) {
			usleep((uint32_t)sleep_ms * 1000);
		}

		continue;

		/* Rollback for failure */
fail:
		if(info)
		{
			uintptr_t temp = (uintptr_t)info;
			cf_queue_push(async_info_queue, (void*)&temp);
		}
	}
	return (0);
}
Example #16
// If there's an element with specified digest in the tree, return a locked
// and reserved reference to it in index_ref. If not, create an element with
// this digest, insert it into the tree, and return a locked and reserved
// reference to it in index_ref.
//
// Returns:
//		 1 - created and inserted (reference returned in index_ref)
//		 0 - found already existing (reference returned in index_ref)
//		-1 - error (index_ref untouched)
int
as_index_get_insert_vlock(as_index_tree *tree, cf_digest *keyd,
		as_index_ref *index_ref)
{
	int cmp = 0;
	bool retry;

	// Save parents as we search for the specified element's insertion point.
	as_index_ele eles[64];
	as_index_ele *ele;

	do {
		ele = eles;

		pthread_mutex_lock(&tree->lock);

		// Search for the specified element, or a parent to insert it under.

		ele->parent = NULL; // we'll never look this far up
		ele->me_h = tree->root_h;
		ele->me = tree->root;

		cf_arenax_handle t_h = tree->root->left_h;
		as_index *t = RESOLVE_H(t_h);

		while (t_h != tree->sentinel_h) {
			ele++;
			ele->parent = ele - 1;
			ele->me_h = t_h;
			ele->me = t;

			if ((cmp = cf_digest_compare(keyd, &t->key)) == 0) {
				// The element already exists, simply return it.

				as_index_reserve(t);
				cf_atomic_int_incr(&g_config.global_record_ref_count);

				pthread_mutex_unlock(&tree->lock);

				if (! index_ref->skip_lock) {
					olock_vlock(g_config.record_locks, keyd, &index_ref->olock);
					cf_atomic_int_incr(&g_config.global_record_lock_count);
				}

				index_ref->r = t;
				index_ref->r_h = t_h;

				return 0;
			}

			t_h = cmp > 0 ? t->left_h : t->right_h;
			t = RESOLVE_H(t_h);
		}

		// We didn't find the tree element, so we'll be inserting it.

		retry = false;

		if (EBUSY == pthread_mutex_trylock(&tree->reduce_lock)) {
			// The tree is being reduced - could take long, unlock so reads and
			// overwrites aren't blocked.
			pthread_mutex_unlock(&tree->lock);

			// Wait until the tree reduce is done...
			pthread_mutex_lock(&tree->reduce_lock);
			pthread_mutex_unlock(&tree->reduce_lock);

			// ... and start over - we unlocked, so the tree may have changed.
			retry = true;
		}
	} while (retry);

	// Create a new element and insert it.

	// Make the new element.
	cf_arenax_handle n_h = cf_arenax_alloc(tree->arena);

	if (n_h == 0) {
		cf_warning(AS_INDEX, "arenax alloc failed");
		pthread_mutex_unlock(&tree->reduce_lock);
		pthread_mutex_unlock(&tree->lock);
		return -1;
	}

	as_index *n = RESOLVE_H(n_h);

	n->rc = 2; // one for create (eventually balanced by delete), one for caller
	cf_atomic_int_add(&g_config.global_record_ref_count, 2);

	n->key = *keyd;

	n->left_h = n->right_h = tree->sentinel_h; // n starts as a leaf element
	n->color = AS_RED; // n's color starts as red

	// Make sure we can detect that the record isn't initialized.
	as_index_clear_record_info(n);

	// Insert the new element n under parent ele.
	if (ele->me == tree->root || 0 < cmp) {
		ele->me->left_h = n_h;
	}
	else {
		ele->me->right_h = n_h;
	}

	ele++;
	ele->parent = ele - 1;
	ele->me_h = n_h;
	ele->me = n;

	// Rebalance the tree as needed.
	as_index_insert_rebalance(tree, ele);

	tree->elements++;

	pthread_mutex_unlock(&tree->reduce_lock);
	pthread_mutex_unlock(&tree->lock);

	if (! index_ref->skip_lock) {
		olock_vlock(g_config.record_locks, keyd, &index_ref->olock);
		cf_atomic_int_incr(&g_config.global_record_lock_count);
	}

	index_ref->r = n;
	index_ref->r_h = n_h;

	return 1;
}
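A hedged sketch of a write path consuming the 1/0/-1 contract above; as_record_done() is assumed to unlock and release the reference as in the other examples:

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	int rv = as_index_get_insert_vlock(tree, &keyd, &r_ref);

	if (rv < 0) {
		return -1; // arena allocation failed - nothing reserved or locked
	}

	if (rv == 1) {
		// Freshly inserted element - its record info is still cleared, so
		// initialize the new record here.
	}
	// rv == 0: the element already existed.

	// ... apply the write under the record lock ...

	as_record_done(&r_ref, ns);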
Example #17
// Keep track of the connections, since they're precious. Kill anything that
// hasn't been used in a while. The file handle array keeps a reference count,
// and allows a reaper to run through and find the ones to reap. The table is
// only written by the demarshal threads, and only read by the reaper thread.
void *
thr_demarshal_reaper_fn(void *arg)
{
	uint64_t last = cf_getms();

	while (true) {
		uint64_t now = cf_getms();
		uint inuse_cnt = 0;
		uint64_t kill_ms = g_config.proto_fd_idle_ms;
		bool refresh = false;

		if (now - last > (uint64_t)(g_config.sec_cfg.privilege_refresh_period * 1000)) {
			refresh = true;
			last = now;
		}

		pthread_mutex_lock(&g_file_handle_a_LOCK);

		for (int i = 0; i < g_file_handle_a_sz; i++) {
			if (g_file_handle_a[i]) {
				as_file_handle *fd_h = g_file_handle_a[i];

				if (refresh) {
					as_security_refresh(fd_h);
				}

				// Reap, if asked to.
				if (fd_h->reap_me) {
					cf_debug(AS_DEMARSHAL, "Reaping FD %d as requested", fd_h->fd);
					g_file_handle_a[i] = 0;
					cf_queue_push(g_freeslot, &i);
					as_release_file_handle(fd_h);
					fd_h = 0;
				}
				// Reap if past kill time.
				else if ((0 != kill_ms) && (fd_h->last_used + kill_ms < now)) {
					if (fd_h->fh_info & FH_INFO_DONOT_REAP) {
						cf_debug(AS_DEMARSHAL, "Not reaping the fd %d as it has the protection bit set", fd_h->fd);
						inuse_cnt++;
						continue;
					}

					shutdown(fd_h->fd, SHUT_RDWR); // will trigger epoll errors
					cf_debug(AS_DEMARSHAL, "remove unused connection, fd %d", fd_h->fd);
					g_file_handle_a[i] = 0;
					cf_queue_push(g_freeslot, &i);
					as_release_file_handle(fd_h);
					fd_h = 0;
					cf_atomic_int_incr(&g_config.reaper_count);
				}
				else {
					inuse_cnt++;
				}
			}
		}

		pthread_mutex_unlock(&g_file_handle_a_LOCK);

		if ((g_file_handle_a_sz / 10) > (g_file_handle_a_sz - inuse_cnt)) {
			cf_warning(AS_DEMARSHAL, "less than ten percent file handles remaining: %d max %d inuse",
					g_file_handle_a_sz, inuse_cnt);
		}

		// Validate the system statistics.
		if (g_config.proto_connections_opened - g_config.proto_connections_closed != inuse_cnt) {
			cf_debug(AS_DEMARSHAL, "reaper: mismatched connection count: %d in stats vs %d calculated",
					g_config.proto_connections_opened - g_config.proto_connections_closed,
					inuse_cnt);
		}

		sleep(1);
	}

	return NULL;
}
Example #18
// Make a callback for a specified number of elements in the tree, from outside
// the tree lock.
void
as_index_reduce_partial(as_index_tree *tree, uint32_t sample_count,
		as_index_reduce_fn cb, void *udata)
{
	pthread_mutex_lock(&tree->reduce_lock);

	// For full reduce, get the number of elements inside the tree lock.
	if (sample_count == AS_REDUCE_ALL) {
		sample_count = tree->elements;
	}

	if (sample_count == 0) {
		pthread_mutex_unlock(&tree->reduce_lock);
		return;
	}

	size_t sz = sizeof(as_index_ph_array) +
			(sizeof(as_index_ph) * sample_count);
	as_index_ph_array *v_a;
	uint8_t buf[64 * 1024];

	if (sz > 64 * 1024) {
		v_a = cf_malloc(sz);

		if (! v_a) {
			pthread_mutex_unlock(&tree->reduce_lock);
			return;
		}
	}
	else {
		v_a = (as_index_ph_array*)buf;
	}

	v_a->alloc_sz = sample_count;
	v_a->pos = 0;

	uint64_t start_ms = cf_getms();

	// Recursively, fetch all the value pointers into this array, so we can make
	// all the callbacks outside the big lock.
	if (tree->root->left_h != tree->sentinel_h) {
		as_index_reduce_traverse(tree, tree->root->left_h, tree->sentinel_h,
				v_a);
	}

	cf_debug(AS_INDEX, "as_index_reduce_traverse took %"PRIu64" ms",
			cf_getms() - start_ms);

	pthread_mutex_unlock(&tree->reduce_lock);

	for (uint32_t i = 0; i < v_a->pos; i++) {
		as_index_ref r_ref;

		r_ref.skip_lock = false;
		r_ref.r = v_a->indexes[i].r;
		r_ref.r_h = v_a->indexes[i].r_h;

		olock_vlock(g_config.record_locks, &r_ref.r->key, &r_ref.olock);
		cf_atomic_int_incr(&g_config.global_record_lock_count);

		// Callback MUST call as_record_done() to unlock and release record.
		cb(&r_ref, udata);
	}

	if (v_a != (as_index_ph_array*)buf) {
		cf_free(v_a);
	}
}
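The comment above notes that every callback must call as_record_done(). A minimal callback sketch under that contract, with the signature inferred from the call site; the counter struct and namespace pointer are assumptions:

	typedef struct {
		as_namespace *ns;
		uint64_t n_records;
	} reduce_counter; // hypothetical udata

	static void
	count_records_cb(as_index_ref *r_ref, void *udata)
	{
		reduce_counter *c = (reduce_counter *)udata;

		c->n_records++;

		// Mandatory: unlock and release the reference handed to us.
		as_record_done(r_ref, c->ns);
	}

	// Sample at most 1000 records:
	// as_index_reduce_partial(tree, 1000, count_records_cb, &counter);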
Example #19
int
as_proxy_shipop(cf_node dst, write_request *wr)
{
	as_partition_id pid = as_partition_getid(wr->keyd);

	if (dst == 0) {
		cf_crash(AS_PROXY, "the destination should never be zero");
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);
	if (!m)	{
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &wr->keyd, sizeof(cf_digest), MSG_SET_COPY);
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) wr->msgp, as_proto_size_get(&wr->msgp->proto), MSG_SET_HANDOFF_MALLOC);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, as_paxos_get_cluster_key());
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, wr->msgp->msg.transaction_ttl);
	wr->msgp = 0;

	// If it is shipped op.
	uint32_t info = 0;
	info |= PROXY_INFO_SHIPPED_OP;
	msg_set_uint32(m, PROXY_FIELD_INFO, info);

	cf_detail_digest(AS_PROXY, &wr->keyd, "SHIPPED_OP %s->WINNER msg %p Proxy Sent to %"PRIx64" %p tid(%d)",
			wr->proxy_msg ? "NONORIG" : "ORIG", m, dst, wr, tid);

	// Fill out a retransmit structure, insert into the retransmit hash.
	msg_incr_ref(m);
	proxy_request pr;
	pr.start_time  = wr->start_time;
	pr.end_time    = (wr->end_time != 0) ? wr->end_time : pr.start_time + g_config.transaction_max_ns;
	cf_rc_reserve(wr);
	pr.wr          = wr;
	pr.fab_msg     = m;
	pr.xmit_ms     = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest        = dst;
	pr.pid         = pid;
	pr.fd_h        = NULL;
	pr.batch_shared = NULL;
	pr.batch_index = 0;

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_info(AS_PROXY, " shash_put failed, need cleanup code");
		return -1;
	}

	// Send to the remote node.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		cf_detail(AS_PROXY, "SHIPPED_OP ORIG [Digest %"PRIx64"] Failed with %d", *(uint64_t *)&wr->keyd, rv);
		as_fabric_msg_put(m);
	}

	wr->shipped_op_initiator = true;
	cf_atomic_int_incr(&g_config.ldt_proxy_initiate);

	return 0;
}
Example #20
// Incoming messages start here.
// - Could get a request that we need to service.
// - Could get a response to one of our requests - need to find the request and
//   send the real response to the remote end.
int
proxy_msg_fn(cf_node id, msg *m, void *udata)
{
	int rv;

	if (cf_rc_count((void*)m) == 0) {
		cf_debug(AS_PROXY, " proxy_msg_fn was given a refcount 0 message! Someone has been naughty %p", m);
		return -1;
	}

	uint32_t op = 99999;
	msg_get_uint32(m, PROXY_FIELD_OP, &op);
	uint32_t transaction_id = 0;
	msg_get_uint32(m, PROXY_FIELD_TID, &transaction_id);

	cf_detail(AS_PROXY, "received proxy message: tid %d type %d from %"PRIx64, transaction_id, op, id);

	switch (op) {
		case PROXY_OP_REQUEST:
		{
			cf_atomic_int_incr(&g_config.proxy_action);

#ifdef DEBUG
			cf_debug(AS_PROXY, "Proxy_msg: received request");
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy msg");
#endif
#endif
			cf_digest *key;
			size_t sz = 0;
			if (0 != msg_get_buf(m, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
				cf_info(AS_PROXY, "proxy msg function: no digest, problem");
				as_fabric_msg_put(m);
				return 0;
			}
			cl_msg *msgp;
			size_t as_msg_sz = 0;
			if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &as_msg_sz, MSG_GET_COPY_MALLOC)) {
				cf_info(AS_PROXY, "proxy msg function: no as msg, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			uint64_t cluster_key = 0;
			if (0 != msg_get_uint64(m, PROXY_FIELD_CLUSTER_KEY, &cluster_key)) {
				cf_info(AS_PROXY, "proxy msg function: no cluster key, problem");
				as_fabric_msg_put(m);
				return 0;
			}

			// This is allowed to fail - this is a new field, and gets defaulted
			// to 0 if it doesn't exist.
			uint32_t timeout_ms = 0;
			msg_get_uint32(m, PROXY_FIELD_TIMEOUT_MS, &timeout_ms);
//			cf_info(AS_PROXY, "proxy msg: received timeout_ms of %d",timeout_ms);

			// Put the as_msg on the normal queue for processing.
			// INIT_TR
			as_transaction tr;
			as_transaction_init(&tr, key, msgp);
			tr.incoming_cluster_key = cluster_key;
			tr.end_time             = (timeout_ms != 0) ? ((uint64_t)timeout_ms * 1000000) + tr.start_time : 0;
			tr.proxy_node           = id;
			tr.proxy_msg            = m;

			// Check here if this is shipped op.
			uint32_t info = 0;
			msg_get_uint32(m, PROXY_FIELD_INFO, &info);
			if (info & PROXY_INFO_SHIPPED_OP) {
				tr.flag |= AS_TRANSACTION_FLAG_SHIPPED_OP;
				cf_detail_digest(AS_PROXY, &tr.keyd, "SHIPPED_OP WINNER Operation Received");
			} else {
				cf_detail_digest(AS_PROXY, &tr.keyd, "Received Proxy Request digest tid(%d)", tr.trid);
			}

			MICROBENCHMARK_RESET();

			thr_tsvc_enqueue(&tr);
		}
		break;

		case PROXY_OP_RESPONSE:
		{
#ifdef DEBUG
			// Got the response from the actual endpoint.
			cf_debug(AS_PROXY, " proxy: received response! tid %d node %"PRIx64, transaction_id, id);
#ifdef DEBUG_VERBOSE
			msg_dump(m, "incoming proxy response");
#endif
#endif

			// Look up the element.
			proxy_request pr;
			bool free_msg = true;
			if (SHASH_OK == shash_get_and_delete(g_proxy_hash, &transaction_id, &pr)) {
				// Found the element (sometimes we get two acks so it's OK for
				// an ack to not find the transaction).

				if (pr.wr) {
					as_proxy_shipop_response_hdlr(m, &pr, &free_msg);
				} else {
					as_proto *proto;
					size_t proto_sz;
					if (0 != msg_get_buf(m, PROXY_FIELD_AS_PROTO, (byte **) &proto, &proto_sz, MSG_GET_DIRECT)) {
						cf_info(AS_PROXY, "msg get buf failed!");
					}

#ifdef DEBUG_VERBOSE
					cf_debug(AS_PROXY, "proxy: sending proto response: ptr %p sz %"PRIu64" %d", proto, proto_sz, pr.fd);
					for (size_t _i = 0; _i < proto_sz; _i++) {
						fprintf(stderr, " %x", ((byte *)proto)[_i]);
						if (_i % 16 == 15) {
							fprintf(stderr, "\n");
						}
					}
#endif

#ifdef EXTRA_CHECKS
					as_proto proto_copy = *proto;
					as_proto_swap(&proto_copy);
					if (proto_copy.sz + 8 != proto_sz) {
						cf_info(AS_PROXY, "BONE BONE BONE!!!");
						cf_info(AS_PROXY, "proto sz: %"PRIu64" sz %u", (uint64_t) proto_copy.sz, proto_sz);
					}
#endif

					// Write to the file descriptor.
					cf_detail(AS_PROXY, "direct write fd %d", pr.fd_h->fd);
					cf_assert(pr.fd_h->fd, AS_PROXY, CF_WARNING, "attempted write to fd 0");

					if (pr.batch_shared) {
						cf_digest* digest;
						size_t digest_sz = 0;

						if (msg_get_buf(pr.fab_msg, PROXY_FIELD_DIGEST, (byte **)&digest, &digest_sz, MSG_GET_DIRECT) == 0) {
							as_batch_add_proxy_result(pr.batch_shared, pr.batch_index, digest, (cl_msg*)proto, proto_sz);
							as_proxy_set_stat_counters(0);
						}
						else {
							cf_warning(AS_PROXY, "Failed to find batch proxy digest %u", transaction_id);
							as_batch_add_error(pr.batch_shared, pr.batch_index, AS_PROTO_RESULT_FAIL_UNKNOWN);
							as_proxy_set_stat_counters(-1);
						}
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);
					}
					else {
						size_t pos = 0;
						while (pos < proto_sz) {
							rv = send(pr.fd_h->fd, (((uint8_t *)proto) + pos), proto_sz - pos, MSG_NOSIGNAL);
							if (rv > 0) {
								pos += rv;
							}
							else if (rv < 0) {
								if (errno != EWOULDBLOCK) {
									// Common message when a client aborts.
									cf_debug(AS_PROTO, "protocol proxy write fail: fd %d sz %d pos %d rv %d errno %d", pr.fd_h->fd, proto_sz, pos, rv, errno);
									shutdown(pr.fd_h->fd, SHUT_RDWR);
									as_proxy_set_stat_counters(-1);
									goto SendFin;
								}
								usleep(1); // yield
							}
							else {
								cf_info(AS_PROTO, "protocol write fail zero return: fd %d sz %d pos %d ", pr.fd_h->fd, proto_sz, pos);
								shutdown(pr.fd_h->fd, SHUT_RDWR);
								as_proxy_set_stat_counters(-1);
								goto SendFin;
							}
						}
						as_proxy_set_stat_counters(0);
SendFin:
						cf_hist_track_insert_data_point(g_config.px_hist, pr.start_time);

						// Return the fabric message or the direct file descriptor -
						// after write and complete.
						pr.fd_h->t_inprogress = false;
						AS_RELEASE_FILE_HANDLE(pr.fd_h);
						pr.fd_h = 0;
					}
					as_fabric_msg_put(pr.fab_msg);
					pr.fab_msg = 0;
				}
			}
			else {
				cf_debug(AS_PROXY, "proxy: received result but no transaction, tid %d", transaction_id);
				as_proxy_set_stat_counters(-1);
			}

			if (free_msg) {
				as_fabric_msg_put(m);
			}
		}
		break;

		case PROXY_OP_REDIRECT:
		{
			// Sometimes the destination we proxied a request to isn't able to
			// satisfy it (for example, their copy of the partition in question
			// might be desync).
			cf_node new_dst = 0;
			msg_get_uint64(m, PROXY_FIELD_REDIRECT, &new_dst);
			cf_detail(AS_PROXY, "proxy redirect message: transaction %d to node %"PRIx64, transaction_id, new_dst);

			// Look in the proxy retransmit hash for the tid.
			proxy_request *pr;
			pthread_mutex_t *pr_lock;
			int r = 0;
			if (0 != (r = shash_get_vlock(g_proxy_hash, &transaction_id, (void **)&pr, &pr_lock))) {
				cf_debug(AS_PROXY, "redirect: could not find transaction %d", transaction_id);
				as_fabric_msg_put(m);
				return -1;
			}

			if (g_config.self_node == new_dst) {

				// Although we don't know we're the final destination, undo the
				// proxy-nature and put back on the main queue. Dangerous, as it
				// leaves open the possibility of a looping message.

				cf_digest *key;
				size_t sz = 0;
				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (byte **) &key, &sz, MSG_GET_DIRECT)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no digest, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				cl_msg *msgp;
				sz = 0;
				if (0 != msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (byte **) &msgp, &sz, MSG_GET_COPY_MALLOC)) {
					cf_warning(AS_PROXY, "op_redirect: proxy msg function: no as proto, problem");
					pthread_mutex_unlock(pr_lock);
					as_fabric_msg_put(m);
					return -1;
				}

				// Put the as_msg on the normal queue for processing.
				// INIT_TR
				as_transaction tr;
				as_transaction_init(&tr, key, msgp);
				tr.start_time = pr->start_time; // start time
				tr.end_time   = pr->end_time;
				tr.proto_fd_h = pr->fd_h;
				tr.batch_shared = pr->batch_shared;
				tr.batch_index = pr->batch_index;

				MICROBENCHMARK_RESET();

				thr_tsvc_enqueue(&tr);

				as_fabric_msg_put(pr->fab_msg);
				shash_delete_lockfree(g_proxy_hash, &transaction_id);
			}
			else {
				// Change the destination, update the retransmit time.
				pr->dest = new_dst;
				pr->xmit_ms = cf_getms() + 1;

				// Send it.
				msg_incr_ref(pr->fab_msg);
				if (0 != (rv = as_fabric_send(pr->dest, pr->fab_msg, AS_FABRIC_PRIORITY_MEDIUM))) {
					cf_debug(AS_PROXY, "redirect: change destination: %"PRIx64" send error %d", pr->dest, rv);
					as_fabric_msg_put(pr->fab_msg);
				}
			}

			pthread_mutex_unlock(pr_lock);
		}
		as_fabric_msg_put(m);
		break;
		default:
			cf_debug(AS_PROXY, "proxy_msg_fn: received unknown, unsupported message %d from remote endpoint", op);
			msg_dump(m, "proxy received unknown msg");
			as_fabric_msg_put(m);
			break;
	} // end switch

	return 0;
} // end proxy_msg_fn()
Example #21
// Set of threads which talk to the client over the connection and do the
// required processing. Note that once an fd is assigned to a thread, all the
// work on that fd is done by that thread. Fair fd usage is expected of the
// client. The first thread is special - it also accepts new connections; it
// is the only thread which does so.
void *
thr_demarshal(void *arg)
{
	cf_socket_cfg *s, *ls;
	// Create my epoll fd, register in the global list.
	struct epoll_event ev;
	int nevents, i, n, epoll_fd;
	cf_clock last_fd_print = 0;

#if defined(USE_SYSTEMTAP)
	uint64_t nodeid = g_config.self_node;
#endif

	// Early stage aborts; these will cause faults in process scope.
	cf_assert(arg, AS_DEMARSHAL, CF_CRITICAL, "invalid argument");
	s = &g_config.socket;
	ls = &g_config.localhost_socket;

#ifdef USE_JEM
	int orig_arena;
	if (0 > (orig_arena = jem_get_arena())) {
		cf_crash(AS_DEMARSHAL, "Failed to get original arena for thr_demarshal()!");
	} else {
		cf_info(AS_DEMARSHAL, "Saved original JEMalloc arena #%d for thr_demarshal()", orig_arena);
	}
#endif

	// Figure out my thread index.
	pthread_t self = pthread_self();
	int thr_id;
	for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) {
		if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self))
			break;
	}

	if (thr_id == MAX_DEMARSHAL_THREADS) {
		cf_debug(AS_FABRIC, "Demarshal thread could not figure own ID, bogus, exit, fu!");
		return(0);
	}

	// First thread accepts new connection at interface socket.
	if (thr_id == 0) {
		demarshal_file_handle_init();
		epoll_fd = epoll_create(EPOLL_SZ);
		if (epoll_fd == -1)
			cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno));

		memset(&ev, 0, sizeof (ev));
		ev.events = EPOLLIN | EPOLLERR | EPOLLHUP;
		ev.data.fd = s->sock;
		if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, s->sock, &ev))
			cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno));
		cf_info(AS_DEMARSHAL, "Service started: socket %s:%d", s->addr, s->port);

		if (ls->sock) {
			ev.events = EPOLLIN | EPOLLERR | EPOLLHUP;
			ev.data.fd = ls->sock;
			if (0 > epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ls->sock, &ev))
			  cf_crash(AS_DEMARSHAL, "epoll_ctl(): %s", cf_strerror(errno));
			cf_info(AS_DEMARSHAL, "Service also listening on localhost socket %s:%d", ls->addr, ls->port);
		}
	}
	else {
		epoll_fd = epoll_create(EPOLL_SZ);
		if (epoll_fd == -1)
			cf_crash(AS_DEMARSHAL, "epoll_create(): %s", cf_strerror(errno));
	}

	g_demarshal_args->epoll_fd[thr_id] = epoll_fd;
	cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id);

	int id_cntr = 0;

	// Demarshal transactions from the socket.
	for ( ; ; ) {
		struct epoll_event events[EPOLL_SZ];

		cf_detail(AS_DEMARSHAL, "calling epoll");

		nevents = epoll_wait(epoll_fd, events, EPOLL_SZ, -1);

		if (0 > nevents) {
			cf_debug(AS_DEMARSHAL, "epoll_wait() returned %d ; errno = %d (%s)", nevents, errno, cf_strerror(errno));
		}

		cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents);

		uint64_t now_ns = cf_getns();
		uint64_t now_ms = now_ns / 1000000;

		// Iterate over all events.
		for (i = 0; i < nevents; i++) {
			if ((s->sock == events[i].data.fd) || (ls->sock == events[i].data.fd)) {
				// Accept new connections on the service socket.
				int csocket = -1;
				struct sockaddr_in caddr;
				socklen_t clen = sizeof(caddr);
				char cpaddr[64];

				if (-1 == (csocket = accept(events[i].data.fd, (struct sockaddr *)&caddr, &clen))) {
					// This means we're out of file descriptors - could be a SYN
					// flood attack or misbehaving client. Eventually we'd like
					// to make the reaper fairer, but for now we'll just have to
					// ignore the accept error and move on.
					if ((errno == EMFILE) || (errno == ENFILE)) {
						if (last_fd_print != (cf_getms() / 1000L)) {
							cf_info(AS_DEMARSHAL, " warning: hit OS file descriptor limit (EMFILE on accept), consider raising limit");
							last_fd_print = cf_getms() / 1000L;
						}
						continue;
					}
					cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno);
				}

				// Get the client IP address in string form.
				if (caddr.sin_family == AF_INET) {
					if (NULL == inet_ntop(AF_INET, &caddr.sin_addr.s_addr, (char *)cpaddr, sizeof(cpaddr))) {
						cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno);
					}
				}
				else if (caddr.sin_family == AF_INET6) {
					struct sockaddr_in6* addr_in6 = (struct sockaddr_in6*)&caddr;

					if (NULL == inet_ntop(AF_INET6, &addr_in6->sin6_addr, (char *)cpaddr, sizeof(cpaddr))) {
						cf_crash(AS_DEMARSHAL, "inet_ntop(): %s (errno %d)", cf_strerror(errno), errno);
					}
				}
				else {
					cf_crash(AS_DEMARSHAL, "unknown address family %u", caddr.sin_family);
				}

				cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", cpaddr, csocket);

				// Validate the limit of protocol connections we allow.
				uint32_t conns_open = g_config.proto_connections_opened - g_config.proto_connections_closed;
				if (conns_open > g_config.n_proto_fd_max) {
					if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs
						cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open);
						last_fd_print = cf_getms();
					}
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					continue;
				}

				// Set the socket to nonblocking.
				if (-1 == cf_socket_set_nonblocking(csocket)) {
					cf_info(AS_DEMARSHAL, "unable to set client socket to nonblocking mode");
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					continue;
				}

				// Create as_file_handle and queue it up in epoll_fd for further
				// communication on one of the demarshal threads.
				as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle));
				if (!fd_h) {
					cf_crash(AS_DEMARSHAL, "malloc");
				}

				sprintf(fd_h->client, "%s:%d", cpaddr, ntohs(caddr.sin_port));
				fd_h->fd = csocket;

				fd_h->last_used = cf_getms();
				fd_h->reap_me = false;
				fd_h->trans_active = false;
				fd_h->proto = 0;
				fd_h->proto_unread = 0;
				fd_h->fh_info = 0;
				fd_h->security_filter = as_security_filter_create();

				// Insert into the global table so the reaper can manage it. Do
				// this before queueing it up for demarshal threads - once
				// EPOLL_CTL_ADD is done it's difficult to back out (if insert
				// into global table fails) because fd state could be anything.
				cf_rc_reserve(fd_h);

				pthread_mutex_lock(&g_file_handle_a_LOCK);

				int j;
				bool inserted = true;

				if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) {
					inserted = false;
				}
				else {
					g_file_handle_a[j] = fd_h;
				}

				pthread_mutex_unlock(&g_file_handle_a_LOCK);

				if (!inserted) {
					cf_info(AS_DEMARSHAL, "unable to add socket to file handle table");
					shutdown(csocket, SHUT_RDWR);
					close(csocket);
					csocket = -1;
					cf_rc_free(fd_h); // will free even with ref-count of 2
				}
				else {
					// Place the client socket in the event queue.
					memset(&ev, 0, sizeof(ev));
					ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP ;
					ev.data.ptr = fd_h;

					// Round-robin pick up demarshal thread epoll_fd and add
					// this new connection to epoll.
					int id;
					while (true) {
						id = (id_cntr++) % g_demarshal_args->num_threads;
						if (g_demarshal_args->epoll_fd[id] != 0) {
							break;
						}
					}

					fd_h->epoll_fd = g_demarshal_args->epoll_fd[id];

					if (0 > (n = epoll_ctl(fd_h->epoll_fd, EPOLL_CTL_ADD, csocket, &ev))) {
						cf_info(AS_DEMARSHAL, "unable to add socket to event queue of demarshal thread %d %d", id, g_demarshal_args->num_threads);
						pthread_mutex_lock(&g_file_handle_a_LOCK);
						fd_h->reap_me = true;
						as_release_file_handle(fd_h);
						fd_h = 0;
						pthread_mutex_unlock(&g_file_handle_a_LOCK);
					}
					else {
						cf_atomic_int_incr(&g_config.proto_connections_opened);
					}
				}
			}
			else {
				bool has_extra_ref   = false;
				as_file_handle *fd_h = events[i].data.ptr;
				if (fd_h == 0) {
					cf_info(AS_DEMARSHAL, "event with null handle, continuing");
					goto NextEvent;
				}

				cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", fd_h->fd, events[i].events);

				// Process data on an existing connection: this might be more
				// activity on an already existing transaction, so we have some
				// state to manage.
				as_proto *proto_p = 0;
				int fd = fd_h->fd;

				if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) {
					cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", fd, events[i].events);
					// no longer in use: out of epoll etc
					goto NextEvent_FD_Cleanup;
				}

				if (fd_h->trans_active) {
					goto NextEvent;
				}

				// If pointer is NULL, then we need to create a transaction and
				// store it in the buffer.
				if (fd_h->proto == NULL) {
					as_proto proto;
					int sz;

					/* Get the number of available bytes */
					if (-1 == ioctl(fd, FIONREAD, &sz)) {
						cf_info(AS_DEMARSHAL, "unable to get number of available bytes");
						goto NextEvent_FD_Cleanup;
					}

					// If we don't have enough data to fill the message buffer,
					// just wait and we'll come back to this one. However, we'll
					// let messages with zero size through, since they are
					// likely errors. We don't cleanup the FD in this case since
					// we'll get more data on it.
					if (sz < sizeof(as_proto) && sz != 0) {
						goto NextEvent;
					}

					// Do a preliminary read of the header into a stack-
					// allocated structure, so that later on we can allocate the
					// entire message buffer.
					if (0 >= (n = cf_socket_recv(fd, &proto, sizeof(as_proto), MSG_WAITALL))) {
						cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d sz was %d errno %d", n, sz, errno);
						goto NextEvent_FD_Cleanup;
					}

					if (proto.version != PROTO_VERSION &&
							// For backward compatibility, allow version 0 with
							// security messages.
							! (proto.version == 0 && proto.type == PROTO_TYPE_SECURITY)) {
						cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u",
								fd_h->client, proto.version);
						goto NextEvent_FD_Cleanup;
					}

					// Swap the necessary elements of the as_proto.
					as_proto_swap(&proto);

					if (proto.sz > PROTO_SIZE_MAX) {
						cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %"PRIu64,
								fd_h->client, PROTO_SIZE_MAX, proto.sz);
						goto NextEvent_FD_Cleanup;
					}

#ifdef USE_JEM
					// Attempt to peek the namespace and set the JEMalloc arena accordingly.
					size_t peeked_data_sz = 0;
					size_t min_field_sz = sizeof(uint32_t) + sizeof(char);
					size_t min_as_msg_sz = sizeof(as_msg) + min_field_sz;
					size_t peekbuf_sz = 2048; // (Arbitrary "large enough" size for peeking the fields of "most" AS_MSGs.)
					uint8_t peekbuf[peekbuf_sz];
					if (PROTO_TYPE_AS_MSG == proto.type) {
						size_t offset = sizeof(as_msg);
						// Number of bytes to peek from the socket.
//						size_t peek_sz = peekbuf_sz;                 // Peek up to the size of the peek buffer.
						size_t peek_sz = MIN(proto.sz, peekbuf_sz);  // Peek only up to the minimum necessary number of bytes.
						if (!(peeked_data_sz = cf_socket_recv(fd, peekbuf, peek_sz, 0))) {
							// That's actually legitimate. The as_proto may have gone into one
							// packet, the as_msg into the next one, which we haven't yet received.
							// This just "never happened" without async.
							cf_detail(AS_DEMARSHAL, "could not peek the as_msg header, expected %zu byte(s)", peek_sz);
						}
						if (peeked_data_sz > min_as_msg_sz) {
//							cf_debug(AS_DEMARSHAL, "(Peeked %zu bytes.)", peeked_data_sz);
							if (peeked_data_sz > proto.sz) {
								cf_warning(AS_DEMARSHAL, "Received unexpected extra data from client %s socket %d when peeking as_proto!", fd_h->client, fd);
								log_as_proto_and_peeked_data(&proto, peekbuf, peeked_data_sz);
								goto NextEvent_FD_Cleanup;
							}

							if (((as_msg*)peekbuf)->info1 & AS_MSG_INFO1_BATCH) {
								jem_set_arena(orig_arena);
							} else {
								uint16_t n_fields = ntohs(((as_msg *) peekbuf)->n_fields), field_num = 0;
								bool found = false;
	//							cf_debug(AS_DEMARSHAL, "Found %d AS_MSG fields", n_fields);
								while (!found && (field_num < n_fields)) {
									as_msg_field *field = (as_msg_field *) (&peekbuf[offset]);
									uint32_t value_sz = ntohl(field->field_sz) - 1;
	//								cf_debug(AS_DEMARSHAL, "Field #%d offset: %lu", field_num, offset);
	//								cf_debug(AS_DEMARSHAL, "\tvalue_sz %u", value_sz);
	//								cf_debug(AS_DEMARSHAL, "\ttype %d", field->type);
									if (AS_MSG_FIELD_TYPE_NAMESPACE == field->type) {
										if (value_sz >= AS_ID_NAMESPACE_SZ) {
											cf_warning(AS_DEMARSHAL, "namespace too long (%u) in as_msg", value_sz);
											goto NextEvent_FD_Cleanup;
										}
										char ns[AS_ID_NAMESPACE_SZ];
										found = true;
										memcpy(ns, field->data, value_sz);
										ns[value_sz] = '\0';
	//									cf_debug(AS_DEMARSHAL, "Found ns \"%s\" in field #%d.", ns, field_num);
										jem_set_arena(as_namespace_get_jem_arena(ns));
									} else {
	//									cf_debug(AS_DEMARSHAL, "Message field %d is not namespace (type %d) ~~ Reading next field", field_num, field->type);
										field_num++;
										offset += sizeof(as_msg_field) + value_sz;
										if (offset >= peeked_data_sz) {
											break;
										}
									}
								}
								if (!found) {
									cf_warning(AS_DEMARSHAL, "Can't get namespace from AS_MSG (peeked %zu bytes) ~~ Using default thr_demarshal arena.", peeked_data_sz);
									jem_set_arena(orig_arena);
								}
							}
						} else {
							jem_set_arena(orig_arena);
						}
					} else {
						jem_set_arena(orig_arena);
					}
#endif

					// Allocate the complete message buffer.
					proto_p = cf_malloc(sizeof(as_proto) + proto.sz);

					cf_assert(proto_p, AS_DEMARSHAL, CF_CRITICAL, "allocation: %zu %s", (sizeof(as_proto) + proto.sz), cf_strerror(errno));
					memcpy(proto_p, &proto, sizeof(as_proto));

#ifdef USE_JEM
					// Jam in the peeked data.
					if (peeked_data_sz) {
						memcpy(proto_p->data, &peekbuf, peeked_data_sz);
					}
					fd_h->proto_unread = proto_p->sz - peeked_data_sz;
#else
					fd_h->proto_unread = proto_p->sz;
#endif
					fd_h->proto = (void *) proto_p;
				}
				else {
					proto_p = fd_h->proto;
				}

				if (fd_h->proto_unread > 0) {

					// Read the data.
					n = cf_socket_recv(fd, proto_p->data + (proto_p->sz - fd_h->proto_unread), fd_h->proto_unread, 0);
					if (0 >= n) {
						if (errno == EAGAIN) {
							continue;
						}
						cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", n, errno, cf_strerror(errno));
						goto NextEvent_FD_Cleanup;
					}

					// Decrement bytes-unread counter.
					cf_detail(AS_DEMARSHAL, "read fd %d (%d %d)", fd, n, fd_h->proto_unread);
					fd_h->proto_unread -= n;
				}

				// Check for a finished read.
				if (0 == fd_h->proto_unread) {

					// It's only really live if it's injecting a transaction.
					fd_h->last_used = now_ms;

					thr_demarshal_pause(fd_h); // pause reading while the transaction is in progress
					fd_h->proto = 0;
					fd_h->proto_unread = 0;

					// INIT_TR
					as_transaction tr;
					as_transaction_init(&tr, NULL, (cl_msg *)proto_p);

					cf_rc_reserve(fd_h);
					has_extra_ref   = true;
					tr.proto_fd_h   = fd_h;
					tr.start_time   = now_ns; // set transaction start time
					tr.preprocessed = false;

					if (! as_proto_is_valid_type(proto_p)) {
						cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type);
						// We got a proto message type we don't recognize, so it
						// may not do any good to send back an as_msg error, but
						// it's the best we can do. At least we can keep the fd.
						as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					if (g_config.microbenchmarks) {
						histogram_insert_data_point(g_config.demarshal_hist, now_ns);
						tr.microbenchmark_time = cf_getns();
					}

					// Check if it's compressed.
					if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) {
						// Decompress it - allocate buffer to hold decompressed
						// packet.
						uint8_t *decompressed_buf = NULL;
						size_t decompressed_buf_size = 0;
						int rv = 0;
						if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) {
							cf_warning(AS_DEMARSHAL, "as_proto decompression failed! (rv %d)", rv);
							cf_warning_binary(AS_DEMARSHAL, proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p");
							as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
							cf_atomic_int_incr(&g_config.proto_transactions);
							goto NextEvent;
						}
						// Count the packets.
						cf_atomic_int_add(&g_config.stat_compressed_pkts_received, 1);
						// Free the compressed packet since we'll be using the
						// decompressed packet from now on.
						cf_free(proto_p);
						proto_p = NULL;
						// Get original packet.
						tr.msgp = (cl_msg *)decompressed_buf;
						as_proto_swap(&(tr.msgp->proto));

						if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) {
							cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]",
									tr.msgp->proto.version, tr.msgp->proto.type, tr.msgp->proto.sz, decompressed_buf_size);
							as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN);
							cf_atomic_int_incr(&g_config.proto_transactions);
							goto NextEvent;
						}
					}

					// Security protocol transactions.
					if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) {
						as_security_transact(&tr);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					// Info protocol requests.
					if (tr.msgp->proto.type == PROTO_TYPE_INFO) {
						if (as_info(&tr)) {
							cf_warning(AS_DEMARSHAL, "Info request failed to be enqueued ~~ Freeing protocol buffer");
							goto NextEvent_FD_Cleanup;
						}
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp);

					// Fast path for batch requests.
					if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) {
						as_batch_queue_task(&tr);
						cf_atomic_int_incr(&g_config.proto_transactions);
						goto NextEvent;
					}

					// Either process the transaction directly in this thread,
					// or queue it for processing by another thread (tsvc/info).
					if (0 != thr_tsvc_process_or_enqueue(&tr)) {
						cf_warning(AS_DEMARSHAL, "Failed to queue transaction to the service thread");
						goto NextEvent_FD_Cleanup;
					}
					else {
						cf_atomic_int_incr(&g_config.proto_transactions);
					}
				}

				// Jump the proto message free & FD cleanup. If we get here, the
				// above operations went smoothly. The message free & FD cleanup
				// job is handled elsewhere as directed by
				// thr_tsvc_process_or_enqueue().
				goto NextEvent;

NextEvent_FD_Cleanup:
				// If we allocated memory for the incoming message, free it.
				if (proto_p) {
					cf_free(proto_p);
					fd_h->proto = 0;
				}
				// If fd has extra reference for transaction, release it.
				if (has_extra_ref) {
					cf_rc_release(fd_h);
				}
				// Remove the fd from the events list.
				if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, 0) < 0) {
					cf_crash(AS_DEMARSHAL, "unable to remove socket FD %d from epoll instance FD %d: %d (%s)",
							fd, epoll_fd, errno, cf_strerror(errno));
				}
				pthread_mutex_lock(&g_file_handle_a_LOCK);
				fd_h->reap_me = true;
				as_release_file_handle(fd_h);
				fd_h = 0;
				pthread_mutex_unlock(&g_file_handle_a_LOCK);
NextEvent:
				;
			}

			// We should never be canceled externally, but just in case...
			pthread_testcancel();
		}
	}

	return NULL;
}
Example #22
// Build response to batch request.
static void
batch_build_response(batch_transaction* btr, cf_buf_builder** bb_r)
{
	as_namespace* ns = btr->ns;
	batch_digests *bmds = btr->digests;
	bool get_data = btr->get_data;
	uint32_t yield_count = 0;

	for (int i = 0; i < bmds->n_digests; i++)
	{
		batch_digest *bmd = &bmds->digest[i];

		if (bmd->done == false) {
			// try to get the key
			as_partition_reservation rsv;
			AS_PARTITION_RESERVATION_INIT(rsv);
			cf_node other_node = 0;
			uint64_t cluster_key;

			if (! *bb_r) {
				*bb_r = cf_buf_builder_create_size(1024 * 4);
			}

			int rv = as_partition_reserve_read(ns, as_partition_getid(bmd->keyd), &rsv, &other_node, &cluster_key);

			if (rv == 0) {
				cf_atomic_int_incr(&g_config.batch_tree_count);

				as_index_ref r_ref;
				r_ref.skip_lock = false;
				int rec_rv = as_record_get(rsv.tree, &bmd->keyd, &r_ref, ns);

				if (rec_rv == 0) {
					as_index *r = r_ref.r;

					// Check to see this isn't an expired record waiting to die.
					if (r->void_time && r->void_time < as_record_void_time_get()) {
						as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);
					}
					else {
						// Make sure it's brought in from storage if necessary.
						as_storage_rd rd;
						if (get_data) {
							as_storage_record_open(ns, r, &rd, &r->key);
							rd.n_bins = as_bin_get_n_bins(r, &rd);
						}

						// Note: this array must stay in scope until the
						// response for this record has been built, since in the
						// get data w/ record on device case, it's copied by
						// reference directly into the record descriptor.
						as_bin stack_bins[!get_data || rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

						if (get_data) {
							// Figure out which bins you want - for now, all.
							rd.bins = as_bin_get_all(r, &rd, stack_bins);
							rd.n_bins = as_bin_inuse_count(&rd);
						}

						as_msg_make_response_bufbuilder(r, (get_data ? &rd : NULL), bb_r, !get_data, (get_data ? NULL : ns->name), true, false, btr->binlist);

						if (get_data) {
							as_storage_record_close(r, &rd);
						}
					}
					as_record_done(&r_ref, ns);
				}
				else {
					// TODO - what about empty records?
					cf_debug(AS_BATCH, "batch_build_response: as_record_get returned %d : key %"PRIx64, rec_rv, *(uint64_t *)&bmd->keyd);
					as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);
				}

				bmd->done = true;

				as_partition_release(&rsv);
				cf_atomic_int_decr(&g_config.batch_tree_count);
			}
			else {
				cf_debug(AS_BATCH, "batch_build_response: partition reserve read failed: rv %d", rv);

				as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);

				if (other_node != 0) {
					bmd->node = other_node;
					cf_debug(AS_BATCH, "other_node is: %p.", other_node);
				} else {
					cf_debug(AS_BATCH, "other_node is NULL.");
				}
			}

			yield_count++;
			if (yield_count % g_config.batch_priority == 0) {
				usleep(1);
			}
		}
	}
}
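
The loop above throttles itself with a cooperative yield every g_config.batch_priority records. A minimal standalone sketch of that pattern (process_record, n_records and batch_priority are hypothetical placeholders, not names from the code above):

	// Cooperative-yield sketch: do a unit of work, and every batch_priority
	// iterations sleep for a microsecond so other threads can make progress.
	// Requires <unistd.h> for usleep().
	uint32_t yield_count = 0;
	for (uint32_t i = 0; i < n_records; i++) {
		process_record(i);                          // per-record work
		if (++yield_count % batch_priority == 0) {
			usleep(1);
		}
	}
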
Example #23
0
static void* 
async_receiver_fn(void *thdata)
{
	int 		rv = -1;
	bool 		network_error = false;
	cl_async_work	*workitem = NULL;
#if ONEASYNCFD
	cl_async_work	*tmpworkitem = NULL;
#endif
	as_msg 		msg;
	cf_queue	*q_to_use = NULL;
	cl_cluster_node	*thisnode = NULL;

	uint8_t		rd_stack_buf[STACK_BUF_SZ];	
	uint8_t		*rd_buf = rd_stack_buf;
	size_t		rd_buf_sz = 0;

	uint64_t	acktrid;
	// uint64_t	starttime, endtime;
	int		progress_timeout_ms;
	unsigned int 	thread_id = cf_atomic32_incr(&g_thread_count);

	if (thdata == NULL) {
		q_to_use = g_cl_async_q;
	} else {
		thisnode = (cl_cluster_node *)thdata;
		q_to_use = thisnode->asyncwork_q;
	}
    
	// Infinite loop: keep picking work items off the queue and try to obtain their results.
	while(1) {
		network_error = false;
#if ONEASYNCFD
		if(thisnode->dunned == true) {
			do {
				rv = cf_queue_pop(thisnode->asyncwork_q, &workitem, CF_QUEUE_NOWAIT);
				if (rv == CF_QUEUE_OK) {
					cl_cluster_node_put(thisnode);
					free(workitem);
				}
			} while (rv == CF_QUEUE_OK);

			//We want to delete all the workitems of this node
			shash_reduce_delete(g_cl_async_hashtab, cl_del_node_asyncworkitems, thisnode);
			break;
		}
#endif
		//This call will block if there is no element in the queue
		cf_queue_pop(q_to_use, &workitem, CF_QUEUE_FOREVER);
		//TODO: What if the node gets dunned while this pop call is blocked ?
#if ONEASYNCFD
		//cf_debug("Elements remaining in this node's queue=%d, Hash table size=%d",
		//		cf_queue_sz(thisnode->asyncwork_q), shash_get_size(g_cl_async_hashtab));
#endif

		// If we have no progress in 50ms, we should move to the next workitem 
		// and revisit this workitem at a later stage
		progress_timeout_ms = DEFAULT_PROGRESS_TIMEOUT;

		// Read the short fixed-size header into this as_msg.
		rv = cf_socket_read_timeout(workitem->fd, (uint8_t *) &msg, sizeof(as_msg), workitem->deadline, progress_timeout_ms);
		if (rv) {
#if DEBUG
			cf_debug("Citrusleaf: error when reading header from server - rv %d fd %d", rv, workitem->fd);
#endif
			if (rv != ETIMEDOUT) {
				cf_error("Citrusleaf: error when reading header from server - rv %d fd %d",rv,workitem->fd);
				network_error = true;
				goto Error;
			} else {
				goto Retry;
			}

		}
#ifdef DEBUG_VERBOSE
		dump_buf("read header from cluster", (uint8_t *) &msg, sizeof(cl_msg));
#endif
		cl_proto_swap(&msg.proto);
		cl_msg_swap_header(&msg.m);

		// second read for the remainder of the message 
		rd_buf_sz =  msg.proto.sz  - msg.m.header_sz;
		if (rd_buf_sz > 0) {
			if (rd_buf_sz > sizeof(rd_stack_buf)) {
				rd_buf = malloc(rd_buf_sz);
				if (!rd_buf) {
					cf_error("malloc fail: trying %zu",rd_buf_sz);
					rv = -1; 
					goto Error; 
				}
			}

			rv = cf_socket_read_timeout(workitem->fd, rd_buf, rd_buf_sz, workitem->deadline, progress_timeout_ms);
			if (rv) {
				//We already read some part of the message before but failed to read the
				//remaining data for whatever reason (network error or timeout). We cannot
				//reread as we already read partial data. Declare this as error.
				cf_error("Timeout after reading the header but before reading the body");
				goto Error;
			}
#ifdef DEBUG_VERBOSE
			dump_buf("read body from cluster", rd_buf, rd_buf_sz);
#endif	
		}

		rv = CITRUSLEAF_OK;
		goto Ok;

Retry:
		//We are trying to postpone the reading
		if (workitem->deadline && workitem->deadline < cf_getms()) {
			cf_error("async receiver: out of time : deadline %"PRIu64" now %"PRIu64,
					workitem->deadline, cf_getms());
			//cf_error("async receiver: Workitem missed the final deadline");
			rv = CITRUSLEAF_FAIL_TIMEOUT;
			goto Error;
		} else {
			//We have time. Push the element back to the queue to be considered later
			cf_queue_push(q_to_use, &workitem);
		}

		//If we allocated memory in this loop, release it.
		if (rd_buf && (rd_buf != rd_stack_buf)) {
			free(rd_buf);
		}

		cf_atomic_int_incr(&g_async_stats.retries);

		continue;

Error:
		if (network_error == true) {
			/*
			 * In case of async work (for XDS), it may be too extreme to
			 * dun a node on a network error. We just clean things up and
			 * retry connecting to the remote cluster; the network error
			 * may be a transient one.
			 */
		} 

#if ONEASYNCFD
//Do not close FD
#else
		//We do not know the state of FD. It may have pending data to be read.
		//We cannot reuse the FD. So, close it to be on safe side.
		cf_error("async receiver: Closing the fd %d because of error", workitem->fd);
		cf_close(workitem->fd);
		workitem->fd = -1;
#endif
		cf_atomic_int_incr(&g_async_stats.dropouts);
		//Continue down with what we do during an Ok

		// Inform the caller that there is no response from the server for this workitem.
		// No response does not mean the work was not done; it might have completed
		// successfully on the server side, we just didn't get a response for it.
		if (g_fail_cb_fn) {
			g_fail_cb_fn(workitem->udata, rv, workitem->starttime);
		}
Ok:
		//rd_buf may not be there during an error condition.
		if (rd_buf && (rv == CITRUSLEAF_OK)) {
			// As of now, async functionality exists only for the put call.
			// A put response carries nothing useful other than the trid field,
			// so just pass a variable to get back the trid and ignore the rest.
			if (0 != cl_parse(&msg.m, rd_buf, rd_buf_sz, NULL, NULL, NULL, &acktrid, NULL)) {
				rv = CITRUSLEAF_FAIL_UNKNOWN;
			}
			else {
				rv = msg.m.result_code;
				if (workitem->trid != acktrid) {
#if ONEASYNCFD
					// It is likely that we got a response for a different trid.
					// Just delete the correct one from the queue and put the
					// current workitem back in the queue.
					shash_get(g_cl_async_hashtab, &acktrid, &tmpworkitem);
					cf_queue_delete(q_to_use, &tmpworkitem, true);
					cf_queue_push(q_to_use, &workitem);
					//From now on workitem will be the one for which we got ack
					workitem = tmpworkitem;
#endif
#ifdef DEBUG
					cf_debug("Got reply for a different trid. Expected=%"PRIu64" Got=%"PRIu64" FD=%d",
							workitem->trid, acktrid, workitem->fd);
#endif
				}
			}

			if (g_success_cb_fn) {
				g_success_cb_fn(workitem->udata, rv, workitem->starttime);
			}
		}

		//Remember to put back the FD into the pool, if it is re-usable.
		if (workitem->fd != -1) {
			cl_cluster_node_fd_put(workitem->node, workitem->fd, true);
		}
		//Also decrement the reference count for this node
		cl_cluster_node_put(workitem->node);

#if ONEASYNCFD
		//Delete the item from the global hashtable
		if (shash_delete(g_cl_async_hashtab, &workitem->trid) != SHASH_OK)
		{
#if DEBUG
			cf_debug("Failure while trying to delete trid=%"PRIu64" from hashtable", workitem->trid);
#endif
		}
#endif

		//Push it back into the free pool. If the attempt fails, free it.
		if (cf_queue_push(g_cl_workitems_freepool_q, &workitem) == -1) {
			free(workitem);
		}

		//If we allocated memory in this loop, release it.
		if (rd_buf && (rd_buf != rd_stack_buf)) {
			free(rd_buf);
		}

		// Kick this thread out if its ID is greater than the configured number of async threads.
		if (thread_id > cf_atomic32_get(g_async_num_threads)) {
			cf_atomic32_decr(&g_thread_count);
			return NULL;
		}
	} // The infinite loop

	return NULL;
}
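
async_receiver_fn reads each response body into a stack buffer and only falls back to the heap when the payload is larger than the stack buffer. A condensed sketch of that pattern, with needed_sz standing in for the computed body size:

	// Stack-buffer-with-heap-fallback sketch (requires <stdlib.h> for malloc/free):
	uint8_t  stack_buf[STACK_BUF_SZ];
	uint8_t *buf = stack_buf;

	if (needed_sz > sizeof(stack_buf)) {
		buf = malloc(needed_sz);                    // heap fallback for large payloads
		if (! buf) {
			/* handle allocation failure */
		}
	}

	// ... read needed_sz bytes into buf ...

	if (buf != stack_buf) {
		free(buf);                                  // only free what was heap-allocated
	}
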
Example #24
0
/*
 * Update the UDF and LDT statistics based on the UDF execution result,
 * its return value and the operation type.
 *
 * Parameters:
 *  	ns:           namespace whose LDT stats are updated
 *  	op:           execute optype
 *  	ret:          return value of the UDF execution
 *  	is_success:   whether the UDF operation was successful
 *
 *  Returns: nothing
 */
void
udf_rw_update_stats(as_namespace *ns, udf_optype op, int ret, bool is_success)
{
	if (UDF_OP_IS_LDT(op)) {
		if (UDF_OP_IS_READ(op))        cf_atomic_int_incr(&ns->ldt_read_reqs);
		else if (UDF_OP_IS_DELETE(op)) cf_atomic_int_incr(&ns->ldt_delete_reqs);
		else if (UDF_OP_IS_WRITE (op)) cf_atomic_int_incr(&ns->ldt_write_reqs);

		if (ret == 0) {
			if (is_success) {
				if (UDF_OP_IS_READ(op))        cf_atomic_int_incr(&ns->ldt_read_success);
				else if (UDF_OP_IS_DELETE(op)) cf_atomic_int_incr(&ns->ldt_delete_success);
				else if (UDF_OP_IS_WRITE (op)) cf_atomic_int_incr(&ns->ldt_write_success);
			} else {
				cf_atomic_int_incr(&ns->ldt_errs);
			}
		} else {
			cf_atomic_int_incr(&g_config.udf_lua_errs);
		}
	} else {
		if (UDF_OP_IS_READ(op))        cf_atomic_int_incr(&g_config.udf_read_reqs);
		else if (UDF_OP_IS_DELETE(op)) cf_atomic_int_incr(&g_config.udf_delete_reqs);
		else if (UDF_OP_IS_WRITE (op)) cf_atomic_int_incr(&g_config.udf_write_reqs);

		if (ret == 0) {
			if (is_success) {
				if (UDF_OP_IS_READ(op))        cf_atomic_int_incr(&g_config.udf_read_success);
				else if (UDF_OP_IS_DELETE(op)) cf_atomic_int_incr(&g_config.udf_delete_success);
				else if (UDF_OP_IS_WRITE (op)) cf_atomic_int_incr(&g_config.udf_write_success);
			} else {
				if (UDF_OP_IS_READ(op))        cf_atomic_int_incr(&g_config.udf_read_errs_other);
				else if (UDF_OP_IS_DELETE(op)) cf_atomic_int_incr(&g_config.udf_delete_errs_other);
				else if (UDF_OP_IS_WRITE (op)) cf_atomic_int_incr(&g_config.udf_write_errs_other);
			}
		} else {
            cf_info(AS_UDF,"lua error, ret:%d",ret);
			cf_atomic_int_incr(&g_config.udf_lua_errs);
		}
	}
}
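
A hypothetical call site, sketched only to show how the counters above are driven. UDF_OPTYPE_WRITE is assumed here to be the write member of udf_optype; the real caller passes whatever optype the UDF execution produced:

	// A non-LDT write UDF returned 0 from Lua and applied its updates successfully.
	udf_rw_update_stats(tr->rsv.ns, UDF_OPTYPE_WRITE, 0, true);
	// Expected effect: udf_write_reqs and udf_write_success are both incremented.
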
Example #25
0
/**
 * Initialize a new UDF call. This populates the udf_call from information
 * in the current transaction. If the passed-in transaction has req_udata it
 * is assumed to be internal, and the UDF information is picked from the
 * udata associated with it. TODO: Do not overload; define a flag for this.
 *
 * Parameters:
 * 		call the udf_call to populate
 * 		tr   the transaction to build the udf_call from
 *
 * Returns:
 * 		0 if the call was initialized (or the transaction is internal)
 * 		1 if the UDF fields could not be extracted from the message
 * 		  (call is cleared and left inactive)
 */
int
udf_call_init(udf_call * call, as_transaction * tr)
{

	call->active   = false;
	call->udf_type = AS_SCAN_UDF_NONE;
	as_msg_field *  filename = NULL;
	as_msg_field *  function = NULL;
	as_msg_field *  arglist =  NULL;

	if (tr->udata.req_udata) {
		udf_call *ucall = NULL;
		if (tr->udata.req_type == UDF_SCAN_REQUEST) {
			ucall = &((tscan_job *)(tr->udata.req_udata))->call;
		} else if (tr->udata.req_type == UDF_QUERY_REQUEST) {
			ucall = as_query_get_udf_call(tr->udata.req_udata);
		}

		if (ucall) {
			strncpy(call->filename, ucall->filename, sizeof(ucall->filename));
			strncpy(call->function, ucall->function, sizeof(ucall->function));
			call->transaction = tr;
			call->active      = true;
			call->arglist     = ucall->arglist;
			call->udf_type    = ucall->udf_type;
			if (tr->udata.req_type == UDF_SCAN_REQUEST) {
				cf_atomic_int_incr(&g_config.udf_scan_rec_reqs);
			} else if (tr->udata.req_type == UDF_QUERY_REQUEST) {
				cf_atomic_int_incr(&g_config.udf_query_rec_reqs);
			}
		}
		// TODO: return proper macros
		return 0;
	}

	// Check the type of udf
	as_msg_field *  op = NULL;
	op = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_UDF_OP);
	if (!op) {
		// Normal udf operation, no special type
		call->udf_type = 0;
	} else {
		// We got a udf type in the message
		byte optype;
		memcpy(&optype, (byte *)op->data, sizeof(optype));
		if(optype == AS_SCAN_UDF_OP_UDF) {
			cf_debug(AS_UDF, "UDF scan op received");
			call->udf_type = AS_SCAN_UDF_OP_UDF;
		} else if(optype == AS_SCAN_UDF_OP_BACKGROUND) {
			cf_debug(AS_UDF, "UDF scan background op received");
			call->udf_type = AS_SCAN_UDF_OP_BACKGROUND;
		} else {
			cf_warning(AS_UDF, "Undefined udf type received over protocol");
			goto Cleanup;
		}
	}
	filename = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_UDF_FILENAME);
	if ( filename ) {
		function = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_UDF_FUNCTION);
		if ( function ) {
			arglist = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_UDF_ARGLIST);
			if ( arglist ) {
				call->transaction = tr;
				as_msg_field_get_strncpy(filename, &call->filename[0], sizeof(call->filename));
				as_msg_field_get_strncpy(function, &call->function[0], sizeof(call->function));
				call->arglist = arglist;
				call->active = true;
				cf_detail(AS_UDF, "UDF Request Unpacked %s %s", call->filename, call->function);
				return 0;
			}
		}
	}
Cleanup:
	call->transaction = NULL;
	call->filename[0] = 0;
	call->function[0] = 0;
	call->arglist = NULL;

	return 1;
}
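
A hypothetical caller, sketched only to illustrate the return-value contract of udf_call_init as implemented above:

	udf_call call;
	if (udf_call_init(&call, tr) != 0) {
		// No usable UDF fields in the message - treat this as a non-UDF transaction.
	}
	else if (call.active) {
		// filename, function and arglist are populated; dispatch the UDF.
	}
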
Example #26
0
// Put batch request on a separate batch queue.
int
as_batch_direct_queue_task(as_transaction* tr)
{
	cf_atomic_int_incr(&g_config.batch_initiate);

	if (g_config.n_batch_threads <= 0) {
		cf_warning(AS_BATCH, "batch-threads has been disabled.");
		return AS_PROTO_RESULT_FAIL_BATCH_DISABLED;
	}

	as_msg* msg = &tr->msgp->msg;

	as_msg_field* nsfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_NAMESPACE);
	if (! nsfp) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return AS_PROTO_RESULT_FAIL_NAMESPACE;
	}

	as_msg_field* dfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY);
	if (! dfp) {
		cf_warning(AS_BATCH, "Batch digests are required.");
		return AS_PROTO_RESULT_FAIL_PARAMETER;
	}

	uint n_digests = dfp->field_sz / sizeof(cf_digest);

	if (n_digests > g_config.batch_max_requests) {
		cf_warning(AS_BATCH, "Batch request size %u exceeds max %u.", n_digests, g_config.batch_max_requests);
		return AS_PROTO_RESULT_FAIL_BATCH_MAX_REQUESTS;
	}

	batch_transaction btr;
	btr.trid = tr->trid;
	btr.end_time = tr->end_time;
	btr.get_data = !(msg->info1 & AS_MSG_INFO1_GET_NOBINDATA);
	btr.complete = false;

	btr.ns = as_namespace_get_bymsgfield(nsfp);
	if (! btr.ns) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return AS_PROTO_RESULT_FAIL_NAMESPACE;
	}

	// Create the master digest table.
	btr.digests = (batch_digests*) cf_malloc(sizeof(batch_digests) + (sizeof(batch_digest) * n_digests));
	if (! btr.digests) {
		cf_warning(AS_BATCH, "Failed to allocate memory for batch digests.");
		return AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	batch_digests* bmd = btr.digests;
	bmd->n_digests = n_digests;
	uint8_t* digest_field_data = dfp->data;

	for (int i = 0; i < n_digests; i++) {
		bmd->digest[i].done = false;
		bmd->digest[i].node = 0;
		memcpy(&bmd->digest[i].keyd, digest_field_data, sizeof(cf_digest));
		digest_field_data += sizeof(cf_digest);
	}

	btr.binlist = as_binlist_from_op(msg);
	btr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	btr.fd_h->last_used = cf_getms();

	int status = as_thread_pool_queue_task_fixed(&batch_direct_thread_pool, &btr);
	
	if (status) {
		cf_warning(AS_BATCH, "Batch enqueue failed");
		return AS_PROTO_RESULT_FAIL_UNKNOWN;		
	}
	return 0;
}
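
For reference, a condensed sketch of the sizing arithmetic used above, assuming cf_digest is the fixed 20-byte RIPEMD-160 digest:

	// The digest-array field is a packed run of cf_digest values, so the key
	// count is just the field payload divided by the digest size, and the
	// master table is one allocation holding a header plus one slot per key.
	uint n_digests = dfp->field_sz / sizeof(cf_digest);        // e.g. 400 / 20 = 20 keys
	size_t table_sz = sizeof(batch_digests) + sizeof(batch_digest) * n_digests;
	batch_digests* digests = (batch_digests*) cf_malloc(table_sz);
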
Example #27
0
/**
 * aerospike::create(record)
 * Function: udf_aerospike_rec_create
 *
 * Parameters:
 * 		as - as_aerospike
 *		rec - as_rec
 *
 * Return Values:
 * 		1 if the record is already open for read, or already exists on a create
 * 		otherwise, the return value of udf_aerospike__execute_updates
 *
 * Description:
 * 		Create a new record in local storage.
 * 		The record will only be created if it does not exist.
 * 		This assumes the record has a digest that is valid for local storage.
 *
 *		Synchronization : object lock acquired by the transaction thread executing UDF.
 * 		Partition reservation takes place just before the transaction starts executing
 * 		( look for as_partition_reserve_udf in thr_tsvc.c )
 *
 * 		Callers:
 * 		lua interfacing function, mod_lua_aerospike_rec_create
 * 		The return value of udf_aerospike_rec_create is pushed on to the lua stack
 *
 * 		Notes:
 * 		The 'read' and 'exists' flag of udf_record are set to true.
*/
static int
udf_aerospike_rec_create(const as_aerospike * as, const as_rec * rec)
{
	int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__);
	if (ret) {
		return ret;
	}

	udf_record * urecord  = (udf_record *) as_rec_source(rec);

	// make sure record isn't already successfully read
	if (urecord->flag & UDF_RECORD_FLAG_OPEN) {
		cf_detail(AS_UDF, "udf_aerospike_rec_create: Record Already Exists");
		return 1;
	}
	as_transaction *tr    = urecord->tr;
	as_index_ref   *r_ref = urecord->r_ref;
	as_storage_rd  *rd    = urecord->rd;
	as_index_tree  *tree  = tr->rsv.tree;
	bool is_subrec        = false;

	if (urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) {
		tree      = tr->rsv.sub_tree;
		is_subrec = true;
	}

	// make sure we got the record as a create
	bool is_create = false;
	int rv = as_record_get_create(tree, &tr->keyd, r_ref, tr->rsv.ns, is_subrec);
	cf_detail_digest(AS_UDF, &tr->keyd, "Creating %sRecord",
			(urecord->flag & UDF_RECORD_FLAG_IS_SUBRECORD) ? "Sub" : "");

	// rv 0 means record exists, 1 means create, < 0 means fail
	// TODO: Verify correct result codes.
	if (rv == 1) {
		is_create = true;
	} else if (rv == 0) {
		// If it's an expired record, pretend it's a fresh create.
		if (as_record_is_expired(r_ref->r)) {
			as_record_destroy(r_ref->r, tr->rsv.ns);
			as_record_initialize(r_ref, tr->rsv.ns);
			cf_atomic_int_incr(&tr->rsv.ns->n_objects);
			is_create = true;
		} else {
			cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Already Exists 2");
			as_record_done(r_ref, tr->rsv.ns);
			// DO NOT change; the return value of 1 has special meaning for the caller
			return 1;
		}
	} else if (rv < 0) {
		cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Open Failed with rv=%d", rv);
		return rv;
	}

	// Associates the set name with the storage rec and index
	if (tr->msgp) {
		// Apply the set name to the index; close the record if setting the
		// set name fails.
		int rv_set = as_transaction_has_set(tr) ?
				as_record_set_set_from_msg(r_ref->r, tr->rsv.ns, &tr->msgp->msg) : 0;
		if (rv_set != 0) {
			cf_warning(AS_UDF, "udf_aerospike_rec_create: Failed to set setname");
			if (is_create) {
				as_index_delete(tree, &tr->keyd);
			}
			as_record_done(r_ref, tr->rsv.ns);
			return 4;
		}
	}

	urecord->flag |= UDF_RECORD_FLAG_OPEN;
	cf_detail(AS_UDF, "Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd);

	as_index *r    = r_ref->r;
	// open up storage
	as_storage_record_create(urecord->tr->rsv.ns, urecord->r_ref->r,
		urecord->rd, &urecord->tr->keyd);

	cf_detail(AS_UDF, "as_storage_record_create: udf_aerospike_rec_create: r %p rd %p",
		urecord->r_ref->r, urecord->rd);

	// If the message has a key, apply it to the record.
	if (! get_msg_key(tr, rd)) {
		cf_warning(AS_UDF, "udf_aerospike_rec_create: Can't store key");
		if (is_create) {
			as_index_delete(tree, &tr->keyd);
		}
		as_record_done(r_ref, tr->rsv.ns);
		urecord->flag &= ~UDF_RECORD_FLAG_OPEN;
		return 4;
	}

	// if multibin storage, we will use urecord->stack_bins, so set the size appropriately
	if ( ! rd->ns->storage_data_in_memory && ! rd->ns->single_bin ) {
		rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin);
	}

	// Side effect: marks the unused bins as properly unused.
	rd->bins       = as_bin_get_all(r, rd, urecord->stack_bins);
	urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN;

	cf_detail(AS_UDF, "Storage Open %p %x %"PRIx64"", urecord, urecord->flag, *(uint64_t *)&tr->keyd);
	cf_detail(AS_UDF, "udf_aerospike_rec_create: Record created %d", urecord->flag);

	int rc         = udf_aerospike__execute_updates(urecord);
	if (rc) {
		// Executing the record updates failed; if no bins were written, remove the record.
		cf_warning(AS_UDF, "udf_aerospike_rec_create: failure executing record updates (%d)", rc);
		if (!as_bin_inuse_has(urecord->rd)) {
			udf_aerospike_rec_remove(as, rec);
		}
	}
	return rc;
}
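
A hypothetical caller-side check, sketched to illustrate the return-value convention documented above:

	int rv = udf_aerospike_rec_create(as, rec);
	if (rv == 1) {
		// The record already exists (or is already open) - creation is refused,
		// which the Lua layer treats differently from an internal error.
	}
	else if (rv != 0) {
		// udf_aerospike__execute_updates() failed while applying the new bins.
	}
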
Example #28
0
// Make a request to another node.
//
// Note: there's a cheat here. 'as_msg' is used in a raw form, and includes
// structured data (version - type - nfields - sz ...) which should be made more
// wire-protocol-friendly.
int
as_proxy_divert(cf_node dst, as_transaction *tr, as_namespace *ns, uint64_t cluster_key)
{
	cf_detail(AS_PROXY, "proxy divert");

	cf_atomic_int_incr(&g_config.stat_proxy_reqs);
	if (tr->msgp && (tr->msgp->msg.info1 & AS_MSG_INFO1_XDR)) {
		cf_atomic_int_incr(&g_config.stat_proxy_reqs_xdr);
	}
	as_partition_id pid = as_partition_getid(tr->keyd);

	if (dst == 0) {
		// Get the list of replicas.
		dst = as_partition_getreplica_read(ns, pid);
	}

	// Create a fabric message, fill it out.
	msg *m = as_fabric_msg_get(M_TYPE_PROXY);
	if (!m)	{
		return -1;
	}

	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);

	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
	msg_set_uint32(m, PROXY_FIELD_TID, tid);
	msg_set_buf(m, PROXY_FIELD_DIGEST, (void *) &tr->keyd, sizeof(cf_digest), MSG_SET_COPY);
	msg_set_type msettype = tr->batch_shared ? MSG_SET_COPY : MSG_SET_HANDOFF_MALLOC;
	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void *) tr->msgp, as_proto_size_get(&tr->msgp->proto), msettype);
	msg_set_uint64(m, PROXY_FIELD_CLUSTER_KEY, cluster_key);
	msg_set_uint32(m, PROXY_FIELD_TIMEOUT_MS, tr->msgp->msg.transaction_ttl);

	tr->msgp = 0;

	cf_debug_digest(AS_PROXY, &tr->keyd, "proxy_divert: fab_msg %p dst %"PRIx64, m, dst);

	// Fill out a retransmit structure, insert into the retransmit hash.
	msg_incr_ref(m);
	proxy_request pr;
	pr.start_time = tr->start_time;
	pr.end_time = (tr->end_time != 0) ? tr->end_time : pr.start_time + g_config.transaction_max_ns;
	pr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	pr.fab_msg = m;
	pr.xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	pr.retry_interval_ms = g_config.transaction_retry_ms;
	pr.dest = dst;
	pr.pid = pid;
	pr.ns = ns;
	pr.wr = NULL;
	pr.batch_shared = tr->batch_shared;
	pr.batch_index = tr->batch_index;

	if (0 != shash_put(g_proxy_hash, &tid, &pr)) {
		cf_debug(AS_PROXY, " shash_put failed, need cleanup code");
		return -1;
	}

	// Send to the remote node.
	int rv = as_fabric_send(dst, m, AS_FABRIC_PRIORITY_MEDIUM);
	if (rv != 0) {
		cf_debug(AS_PROXY, "as_proxy_divert: returned error %d", rv);
		as_fabric_msg_put(m);
	}

	cf_atomic_int_incr(&g_config.proxy_initiate);

	return 0;
}
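
A hypothetical call site, sketched from the signature above; passing dst == 0 lets the function look up the read replica for the key's partition:

	if (as_proxy_divert(0, tr, ns, cluster_key) != 0) {
		// The fabric message could not be allocated or registered;
		// fail the transaction locally instead of proxying it.
	}
	// On success, ownership of tr->msgp and tr->proto_fd_h has moved to the proxy layer.
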
Example #29
0
// Put batch request on a separate batch queue.
int
as_batch(as_transaction* tr)
{
	as_msg* msg = &tr->msgp->msg;

	as_msg_field* nsfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_NAMESPACE);
	if (! nsfp) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return -1;
	}

	as_msg_field* dfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY);
	if (! dfp) {
		cf_warning(AS_BATCH, "Batch digests are required.");
		return -1;
	}

	uint n_digests = dfp->field_sz / sizeof(cf_digest);

	if (n_digests > g_config.batch_max_requests) {
		cf_warning(AS_BATCH, "Batch request size %u exceeds max %u.", n_digests, g_config.batch_max_requests);
		return -1;
	}

	batch_transaction btr;
	btr.trid = tr->trid;
	btr.end_time = tr->end_time;
	btr.get_data = !(msg->info1 & AS_MSG_INFO1_GET_NOBINDATA);

	btr.ns = as_namespace_get_bymsgfield(nsfp);
	if (! btr.ns) {
		cf_warning(AS_BATCH, "Batch namespace is required.");
		return -1;
	}

	// Create the master digest table.
	btr.digests = (batch_digests*) cf_malloc(sizeof(batch_digests) + (sizeof(batch_digest) * n_digests));
	if (! btr.digests) {
		cf_warning(AS_BATCH, "Failed to allocate memory for batch digests.");
		return -1;
	}

	batch_digests* bmd = btr.digests;
	bmd->n_digests = n_digests;
	uint8_t* digest_field_data = dfp->data;

	for (int i = 0; i < n_digests; i++) {
		bmd->digest[i].done = false;
		bmd->digest[i].node = 0;
		memcpy(&bmd->digest[i].keyd, digest_field_data, sizeof(cf_digest));
		digest_field_data += sizeof(cf_digest);
	}

	btr.binlist = as_binlist_from_op(msg);
	btr.fd_h = tr->proto_fd_h;
	tr->proto_fd_h = 0;
	btr.fd_h->last_used = cf_getms();

	cf_atomic_int_incr(&g_config.batch_initiate);
	cf_queue_push(g_batch_queue, &btr);
	return 0;
}