Exemple #1
0
//------------------------------------------------
// Runs in every thread of every read queue, pops
// readreq objects, does the read and reports the
// read transaction duration.
//
static void* run_reads(void* pv_req_queue) {
	cf_queue* p_req_queue = (cf_queue*)pv_req_queue;
	readreq* p_readreq;

	while (g_running) {
		if (cf_queue_pop(p_req_queue, (void*)&p_readreq, 100) != CF_QUEUE_OK) {
			continue;
		}

		if (g_use_valloc) {
			uint8_t* p_buffer = cf_valloc(p_readreq->size);

			if (p_buffer) {
				read_and_report(p_readreq, p_buffer);
				free(p_buffer);
			}
			else {
				fprintf(stdout, "ERROR: read buffer cf_valloc()\n");
			}
		}
		else {
			uint8_t stack_buffer[p_readreq->size + 4096];
			uint8_t* p_buffer = align_4096(stack_buffer);

			read_and_report(p_readreq, p_buffer);
		}

		free(p_readreq);
		cf_atomic_int_decr(&g_read_reqs_queued);
	}

	return (0);
}
Exemple #2
0
/*
 * Handle a read that has returned from aio_read.
 *
 * Does nothing at all if g_running is false. Otherwise: decrements the count
 * of queued read requests, hands the file descriptor back to its device,
 * records the elapsed times in the read histograms, frees the read buffer
 * when buffers are heap-allocated (g_use_valloc), and finally pushes the
 * info object's address onto async_info_queue.
 */
static void process_read(as_async_info_t *info)
{
	if (! g_running) {
		return;
	}

	cf_atomic_int_decr(&g_read_reqs_queued);

	const uint64_t end_ms = cf_getms();

	fd_put(info->p_readreq.p_device, info->fd);

	/* An all-ones timestamp is treated as "no valid time" - skip the stats. */
	if (end_ms != (uint64_t)-1) {
		/* Global raw-read latency. */
		histogram_insert_data_point(g_p_raw_read_histogram,
				safe_delta_ms(info->raw_start_time, end_ms));

		/* Global latency measured from the request's own start time. */
		histogram_insert_data_point(g_p_read_histogram,
				safe_delta_ms(info->p_readreq.start_time, end_ms));

		/* Per-device raw-read latency. */
		histogram_insert_data_point(
				info->p_readreq.p_device->p_raw_read_histogram,
				safe_delta_ms(info->raw_start_time, end_ms));
	}

	if (g_use_valloc && info->p_buffer) {
		free(info->p_buffer);
	}

	/* The queue stores the info pointer by value, so push its address. */
	uintptr_t info_addr = (uintptr_t)info;

	cf_queue_push(async_info_queue, (void*)&info_addr);
}
Exemple #3
0
/*
 * Create a tree "stub" element for the case where storage holds the index.
 *
 * A fresh arena element is initialized for the key, the record lock is taken
 * (unless index_ref->skip_lock is set), and storage is asked whether the
 * record exists. If it does not exist and create_p is false, everything is
 * undone and index_ref is cleared.
 *
 * Returns:  1 = record not in storage (new) - index_ref is left populated
 *               only when create_p is true
 *           0 = success (found in storage)
 *          -1 = fail (could not allocate the stub)
 */
int
as_index_ref_initialize(as_index_tree *tree, cf_digest *key, as_index_ref *index_ref, bool create_p, as_namespace *ns)
{
	// Get an arena element to serve as the stub; a zero handle means failure.
	cf_arenax_handle stub_h = cf_arenax_alloc(tree->arena);

	if (stub_h == 0) {
		return -1;
	}

	// Fill in the stub: one reference, red, with every link at the sentinel.
	as_index *stub = RESOLVE_H(stub_h);

	stub->key = *key;
	stub->rc = 1;
	stub->right_h = tree->sentinel_h;
	stub->left_h = tree->sentinel_h;
	stub->color = AS_RED;
	stub->parent_h = tree->sentinel_h;

	// Only the KV storage engine is supported here.
	if (ns->storage_type != AS_STORAGE_ENGINE_KV) {
		cf_crash(AS_INDEX, "non-KV storage type ns %s key %p", ns->name, key);
	}
	else {
		// Careful - the file id is unsigned.
		stub->storage_key.kv.file_id = STORAGE_INVALID_FILE_ID;
	}

	// Publish the stub through the caller's reference.
	index_ref->r = stub;
	index_ref->r_h = stub_h;

	if (! index_ref->skip_lock) {
		olock_vlock(g_config.record_locks, key, &(index_ref->olock));
		cf_atomic_int_incr(&g_config.global_record_lock_count);
	}

	as_index_reserve(stub);
	cf_atomic_int_add(&g_config.global_record_ref_count, 2);

	// 1 if storage has no such record, 0 if it does.
	int not_found = ! as_storage_record_exists(ns, key);

	if (! not_found || create_p) {
		return not_found;
	}

	// Not found and we're not creating it - unlock and tear the stub down.
	if (! index_ref->skip_lock) {
		pthread_mutex_unlock(index_ref->olock);
		cf_atomic_int_decr(&g_config.global_record_lock_count);
	}

	// NOTE(review): the global ref count was raised by 2 above but only 1 is
	// given back on this path - confirm this is intended.
	as_index_release(stub);
	cf_atomic_int_decr(&g_config.global_record_ref_count);
	cf_arenax_free(tree->arena, stub_h);

	index_ref->r = 0;
	index_ref->r_h = 0;

	return not_found;
}
Exemple #4
0
// Recursively purge the subtree rooted at r (whose arena handle is r_h).
// Children are purged before their parent. Each element has one reference
// released; if that was the last one, the tree's destructor (if any) is run
// on it and its arena element is freed. The sentinel is never purged.
void
as_index_tree_purge(as_index_tree *tree, as_index *r, cf_arenax_handle r_h)
{
	if (r_h != tree->sentinel_h) {
		// Left subtree first, then right, then this element.
		cf_arenax_handle left_child_h = r->left_h;

		as_index_tree_purge(tree, RESOLVE_H(left_child_h), left_child_h);

		cf_arenax_handle right_child_h = r->right_h;

		as_index_tree_purge(tree, RESOLVE_H(right_child_h), right_child_h);

		if (as_index_release(r) == 0) {
			// That was the last reference - destroy the element.
			if (tree->destructor) {
				tree->destructor(r, tree->destructor_udata);
			}

			cf_arenax_free(tree->arena, r_h);
		}

		// One reference was dropped whether or not the element was destroyed.
		cf_atomic_int_decr(&g_config.global_record_ref_count);
	}
}
Exemple #5
0
// If there's an element with specified digest in the tree, delete it.
//
// Parameters:
//		tree - the index tree to delete from
//		keyd - digest identifying the element to delete
//
// Locking: the search runs under tree->lock. Before the tree is modified,
// tree->reduce_lock is also acquired. If reduce_lock is busy, tree->lock is
// dropped while waiting and the search is redone from scratch. Both locks are
// released before returning.
//
// Returns:
//		 0 - found and deleted
//		-1 - not found
// TODO - nobody cares about the return value, make it void?
int
as_index_delete(as_index_tree *tree, cf_digest *keyd)
{
	as_index *r;
	cf_arenax_handle r_h;
	bool retry;

	// Save parents as we search for the specified element (or its successor).
	// eles is used as a stack of the path from the root down; each entry
	// points back at the entry above it via 'parent'.
	// NOTE(review): the (64 * 2) + 3 capacity is presumably a bound on the
	// maximum path depth plus a few extra slots - confirm.
	as_index_ele eles[(64 * 2) + 3];
	as_index_ele *ele;

	do {
		// (Re)start the path stack at its base for every search attempt.
		ele = eles;

		pthread_mutex_lock(&tree->lock);

		ele->parent = NULL; // we'll never look this far up
		ele->me_h = tree->root_h;
		ele->me = tree->root;

		// The search begins at the left child of tree->root.
		r_h = tree->root->left_h;
		r = RESOLVE_H(r_h);

		while (r_h != tree->sentinel_h) {
			// Push the element being examined onto the path stack.
			ele++;
			ele->parent = ele - 1;
			ele->me_h = r_h;
			ele->me = r;

			int cmp = cf_digest_compare(keyd, &r->key);

			if (cmp == 0) {
				break; // found, we'll be deleting it
			}

			// A positive compare result descends left, otherwise right.
			r_h = cmp > 0 ? r->left_h : r->right_h;
			r = RESOLVE_H(r_h);
		}

		if (r_h == tree->sentinel_h) {
			pthread_mutex_unlock(&tree->lock);
			return -1; // not found, nothing to delete
		}

		// We found the tree element, so we'll be deleting it.

		retry = false;

		// Try to take reduce_lock without blocking while tree->lock is held.
		if (EBUSY == pthread_mutex_trylock(&tree->reduce_lock)) {
			// The tree is being reduced - could take long, unlock so reads and
			// overwrites aren't blocked.
			pthread_mutex_unlock(&tree->lock);

			// Wait until the tree reduce is done...
			pthread_mutex_lock(&tree->reduce_lock);
			pthread_mutex_unlock(&tree->reduce_lock);

			// ... and start over - we unlocked, so the tree may have changed.
			retry = true;
		}
	} while (retry);

	// From here on both tree->lock and tree->reduce_lock are held.

	// Delete the element.

	// Snapshot the element to delete, r. (Already have r_h and r shortcuts.)
	as_index_ele *r_e = ele;

	// Only when r has two real children do we need a distinct successor.
	if (r->left_h != tree->sentinel_h && r->right_h != tree->sentinel_h) {
		// Search down for a "successor"...

		// Step to r's right child...
		ele++;
		ele->parent = ele - 1;
		ele->me_h = r->right_h;
		ele->me = RESOLVE_H(ele->me_h);

		// ... then follow left links as far as they go, extending the path.
		while (ele->me->left_h != tree->sentinel_h) {
			ele++;
			ele->parent = ele - 1;
			ele->me_h = ele->parent->me->left_h;
			ele->me = RESOLVE_H(ele->me_h);
		}
	}
	// else ele is left at r, i.e. s == r

	// Snapshot the successor, s. (Note - s could be r.)
	as_index_ele *s_e = ele;
	cf_arenax_handle s_h = s_e->me_h;
	as_index *s = s_e->me;

	// Get the appropriate child of s. (Note - child could be sentinel.)
	// The child gets its own path-stack entry, one past s.
	ele++;

	if (s->left_h == tree->sentinel_h) {
		ele->me_h = s->right_h;
	}
	else {
		ele->me_h = s->left_h;
	}

	ele->me = RESOLVE_H(ele->me_h);

	// Cut s (remember, it could be r) out of the tree.
	// The child's path entry now hangs directly off s's parent.
	ele->parent = s_e->parent;

	// Replace s with its child in whichever of the parent's links held s.
	if (s_h == s_e->parent->me->left_h) {
		s_e->parent->me->left_h = ele->me_h;
	}
	else {
		s_e->parent->me->right_h = ele->me_h;
	}

	// Rebalance at ele if necessary. (Note - if r != s, r is in the tree, and
	// its parent may change during rebalancing.)
	if (s->color == AS_BLACK) {
		as_index_delete_rebalance(tree, ele);
	}

	if (s != r) {
		// s was a successor distinct from r, put it in r's place in the tree.
		s->left_h = r->left_h;
		s->right_h = r->right_h;
		s->color = r->color;

		// Re-point whichever link of r's parent held r at s instead.
		if (r_h == r_e->parent->me->left_h) {
			r_e->parent->me->left_h = s_h;
		}
		else {
			r_e->parent->me->right_h = s_h;
		}
	}

	// We may now destroy r, which is no longer in the tree.
	// Drop one reference; only if that was the last one is r actually
	// destroyed and its arena element freed.
	if (0 == as_index_release(r)) {
		if (tree->destructor) {
			tree->destructor(r, tree->destructor_udata);
		}

		cf_arenax_free(tree->arena, r_h);
	}

	cf_atomic_int_decr(&g_config.global_record_ref_count);

	tree->elements--;

	pthread_mutex_unlock(&tree->reduce_lock);
	pthread_mutex_unlock(&tree->lock);

	return 0;
}
// Build response to batch request.
//
// Walks every digest in btr->digests that is not yet marked done. For each
// one, the partition is reserved for read and the record is looked up; then
// either the record (with bin data if btr->get_data is true, otherwise
// without) or a NOTFOUND error entry is appended to the buffer builder.
//
// Parameters:
//		btr  - the batch transaction (namespace, digests, get_data flag,
//		       bin list)
//		bb_r - in/out buffer builder pointer; a 4K builder is created on
//		       first use if *bb_r is NULL
//
// Digests whose partition reservation fails get a NOTFOUND entry but are NOT
// marked done; if the reservation reported another node, it is saved in the
// digest's 'node' field.
//
// After every g_config.batch_priority digests processed the thread yields
// briefly. A batch_priority of 0 means "never yield".
static void
batch_build_response(batch_transaction* btr, cf_buf_builder** bb_r)
{
	as_namespace* ns = btr->ns;
	batch_digests *bmds = btr->digests;
	bool get_data = btr->get_data;
	uint32_t yield_count = 0;

	for (int i = 0; i < bmds->n_digests; i++)
	{
		batch_digest *bmd = &bmds->digest[i];

		if (bmd->done == false) {
			// try to get the key
			as_partition_reservation rsv;
			AS_PARTITION_RESERVATION_INIT(rsv);
			cf_node other_node = 0;
			uint64_t cluster_key;

			// Lazily create the response buffer builder.
			if (! *bb_r) {
				*bb_r = cf_buf_builder_create_size(1024 * 4);
			}

			int rv = as_partition_reserve_read(ns, as_partition_getid(bmd->keyd), &rsv, &other_node, &cluster_key);

			if (rv == 0) {
				cf_atomic_int_incr(&g_config.batch_tree_count);

				as_index_ref r_ref;
				r_ref.skip_lock = false;
				int rec_rv = as_record_get(rsv.tree, &bmd->keyd, &r_ref, ns);

				if (rec_rv == 0) {
					as_index *r = r_ref.r;

					// Check to see this isn't an expired record waiting to die.
					if (r->void_time && r->void_time < as_record_void_time_get()) {
						as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);
					}
					else {
						// Make sure it's brought in from storage if necessary.
						// (rd is only initialized, and only used, if get_data.)
						as_storage_rd rd;
						if (get_data) {
							as_storage_record_open(ns, r, &rd, &r->key);
							rd.n_bins = as_bin_get_n_bins(r, &rd);
						}

						// Stack bins are needed only when returning data for
						// a record that is not held in memory.
						uint32_t n_stack_bins = (! get_data || rd.ns->storage_data_in_memory) ? 0 : rd.n_bins;

						// Note: this array must stay in scope until the
						// response for this record has been built, since in the
						// get data w/ record on device case, it's copied by
						// reference directly into the record descriptor.
						//
						// A variable length array must not have size zero, so
						// always declare at least one element.
						as_bin stack_bins[n_stack_bins != 0 ? n_stack_bins : 1];

						if (get_data) {
							// Figure out which bins you want - for now, all.
							rd.bins = as_bin_get_all(r, &rd, stack_bins);
							rd.n_bins = as_bin_inuse_count(&rd);
						}

						as_msg_make_response_bufbuilder(r, (get_data ? &rd : NULL), bb_r, !get_data, (get_data ? NULL : ns->name), true, false, btr->binlist);

						if (get_data) {
							as_storage_record_close(r, &rd);
						}
					}
					as_record_done(&r_ref, ns);
				}
				else {
					// TODO - what about empty records?
					cf_debug(AS_BATCH, "batch_build_response: as_record_get returned %d : key %"PRIx64, rec_rv, *(uint64_t *)&bmd->keyd);
					as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);
				}

				bmd->done = true;

				as_partition_release(&rsv);
				cf_atomic_int_decr(&g_config.batch_tree_count);
			}
			else {
				cf_debug(AS_BATCH, "batch_build_response: partition reserve read failed: rv %d", rv);

				as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOTFOUND, bb_r, ns->name);

				// Deliberately not marked done - remember the other node, if
				// the reservation told us about one.
				if (other_node != 0) {
					bmd->node = other_node;
					// The node id is an integer, not a pointer - print it as
					// 64-bit hex rather than with %p.
					cf_debug(AS_BATCH, "other_node is: %"PRIx64".", (uint64_t)other_node);
				} else {
					cf_debug(AS_BATCH, "other_node is NULL.");
				}
			}

			// Yield periodically so a big batch doesn't hog the thread. Guard
			// against a configured priority of 0, which would otherwise be a
			// modulo by zero.
			yield_count++;
			if (g_config.batch_priority != 0 && yield_count % g_config.batch_priority == 0) {
				usleep(1);
			}
		}
	}
}