Example #1
0
// If there's an element with specified digest in the tree, return a locked
// and reserved reference to it in index_ref. If not, create an element with
// this digest, insert it into the tree, and return a locked and reserved
// reference to it in index_ref.
//
// Returns:
//		 1 - created and inserted (reference returned in index_ref)
//		 0 - found already existing (reference returned in index_ref)
//		-1 - error (index_ref untouched)
int
as_index_get_insert_vlock(as_index_tree *tree, cf_digest *keyd,
		as_index_ref *index_ref)
{
	int cmp = 0;
	bool retry;

	// Save parents as we search for the specified element's insertion point.
	as_index_ele eles[64];
	as_index_ele *ele;

	do {
		ele = eles;

		pthread_mutex_lock(&tree->lock);

		// Search for the specified element, or a parent to insert it under.

		ele->parent = NULL; // we'll never look this far up
		ele->me_h = tree->root_h;
		ele->me = tree->root;

		cf_arenax_handle t_h = tree->root->left_h;
		as_index *t = RESOLVE_H(t_h);

		while (t_h != tree->sentinel_h) {
			ele++;
			ele->parent = ele - 1;
			ele->me_h = t_h;
			ele->me = t;

			if ((cmp = cf_digest_compare(keyd, &t->key)) == 0) {
				// The element already exists, simply return it.

				as_index_reserve(t);
				cf_atomic_int_incr(&g_config.global_record_ref_count);

				pthread_mutex_unlock(&tree->lock);

				if (! index_ref->skip_lock) {
					olock_vlock(g_config.record_locks, keyd, &index_ref->olock);
					cf_atomic_int_incr(&g_config.global_record_lock_count);
				}

				index_ref->r = t;
				index_ref->r_h = t_h;

				return 0;
			}

			t_h = cmp > 0 ? t->left_h : t->right_h;
			t = RESOLVE_H(t_h);
		}

		// We didn't find the tree element, so we'll be inserting it.

		retry = false;

		if (EBUSY == pthread_mutex_trylock(&tree->reduce_lock)) {
			// The tree is being reduced - could take long, unlock so reads and
			// overwrites aren't blocked.
			pthread_mutex_unlock(&tree->lock);

			// Wait until the tree reduce is done...
			pthread_mutex_lock(&tree->reduce_lock);
			pthread_mutex_unlock(&tree->reduce_lock);

			// ... and start over - we unlocked, so the tree may have changed.
			retry = true;
		}
	} while (retry);

	// Create a new element and insert it.

	// Make the new element.
	cf_arenax_handle n_h = cf_arenax_alloc(tree->arena);

	if (n_h == 0) {
		cf_warning(AS_INDEX, "arenax alloc failed");
		pthread_mutex_unlock(&tree->reduce_lock);
		pthread_mutex_unlock(&tree->lock);
		return -1;
	}

	as_index *n = RESOLVE_H(n_h);

	n->rc = 2; // one for create (eventually balanced by delete), one for caller
	cf_atomic_int_add(&g_config.global_record_ref_count, 2);

	n->key = *keyd;

	n->left_h = n->right_h = tree->sentinel_h; // n starts as a leaf element
	n->color = AS_RED; // n's color starts as red

	// Make sure we can detect that the record isn't initialized.
	as_index_clear_record_info(n);

	// Insert the new element n under parent ele.
	if (ele->me == tree->root || 0 < cmp) {
		ele->me->left_h = n_h;
	}
	else {
		ele->me->right_h = n_h;
	}

	ele++;
	ele->parent = ele - 1;
	ele->me_h = n_h;
	ele->me = n;

	// Rebalance the tree as needed.
	as_index_insert_rebalance(tree, ele);

	tree->elements++;

	pthread_mutex_unlock(&tree->reduce_lock);
	pthread_mutex_unlock(&tree->lock);

	if (! index_ref->skip_lock) {
		olock_vlock(g_config.record_locks, keyd, &index_ref->olock);
		cf_atomic_int_incr(&g_config.global_record_lock_count);
	}

	index_ref->r = n;
	index_ref->r_h = n_h;

	return 1;
}
Example #2
0
// If there's an element with specified digest in the tree, delete it.
//
// Returns:
//		 0 - found and deleted
//		-1 - not found
// TODO - nobody cares about the return value, make it void?
int
as_index_delete(as_index_tree *tree, cf_digest *keyd)
{
	as_index *r;
	cf_arenax_handle r_h;
	bool retry;

	// Save parents as we search for the specified element (or its successor).
	as_index_ele eles[(64 * 2) + 3];
	as_index_ele *ele;

	do {
		ele = eles;

		pthread_mutex_lock(&tree->lock);

		ele->parent = NULL; // we'll never look this far up
		ele->me_h = tree->root_h;
		ele->me = tree->root;

		r_h = tree->root->left_h;
		r = RESOLVE_H(r_h);

		while (r_h != tree->sentinel_h) {
			ele++;
			ele->parent = ele - 1;
			ele->me_h = r_h;
			ele->me = r;

			int cmp = cf_digest_compare(keyd, &r->key);

			if (cmp == 0) {
				break; // found, we'll be deleting it
			}

			r_h = cmp > 0 ? r->left_h : r->right_h;
			r = RESOLVE_H(r_h);
		}

		if (r_h == tree->sentinel_h) {
			pthread_mutex_unlock(&tree->lock);
			return -1; // not found, nothing to delete
		}

		// We found the tree element, so we'll be deleting it.

		retry = false;

		if (EBUSY == pthread_mutex_trylock(&tree->reduce_lock)) {
			// The tree is being reduced - could take long, unlock so reads and
			// overwrites aren't blocked.
			pthread_mutex_unlock(&tree->lock);

			// Wait until the tree reduce is done...
			pthread_mutex_lock(&tree->reduce_lock);
			pthread_mutex_unlock(&tree->reduce_lock);

			// ... and start over - we unlocked, so the tree may have changed.
			retry = true;
		}
	} while (retry);

	// Delete the element.

	// Snapshot the element to delete, r. (Already have r_h and r shortcuts.)
	as_index_ele *r_e = ele;

	if (r->left_h != tree->sentinel_h && r->right_h != tree->sentinel_h) {
		// Search down for a "successor"...

		ele++;
		ele->parent = ele - 1;
		ele->me_h = r->right_h;
		ele->me = RESOLVE_H(ele->me_h);

		while (ele->me->left_h != tree->sentinel_h) {
			ele++;
			ele->parent = ele - 1;
			ele->me_h = ele->parent->me->left_h;
			ele->me = RESOLVE_H(ele->me_h);
		}
	}
	// else ele is left at r, i.e. s == r

	// Snapshot the successor, s. (Note - s could be r.)
	as_index_ele *s_e = ele;
	cf_arenax_handle s_h = s_e->me_h;
	as_index *s = s_e->me;

	// Get the appropriate child of s. (Note - child could be sentinel.)
	ele++;

	if (s->left_h == tree->sentinel_h) {
		ele->me_h = s->right_h;
	}
	else {
		ele->me_h = s->left_h;
	}

	ele->me = RESOLVE_H(ele->me_h);

	// Cut s (remember, it could be r) out of the tree.
	ele->parent = s_e->parent;

	if (s_h == s_e->parent->me->left_h) {
		s_e->parent->me->left_h = ele->me_h;
	}
	else {
		s_e->parent->me->right_h = ele->me_h;
	}

	// Rebalance at ele if necessary. (Note - if r != s, r is in the tree, and
	// its parent may change during rebalancing.)
	if (s->color == AS_BLACK) {
		as_index_delete_rebalance(tree, ele);
	}

	if (s != r) {
		// s was a successor distinct from r, put it in r's place in the tree.
		s->left_h = r->left_h;
		s->right_h = r->right_h;
		s->color = r->color;

		if (r_h == r_e->parent->me->left_h) {
			r_e->parent->me->left_h = s_h;
		}
		else {
			r_e->parent->me->right_h = s_h;
		}
	}

	// We may now destroy r, which is no longer in the tree.
	if (0 == as_index_release(r)) {
		if (tree->destructor) {
			tree->destructor(r, tree->destructor_udata);
		}

		cf_arenax_free(tree->arena, r_h);
	}

	cf_atomic_int_decr(&g_config.global_record_ref_count);

	tree->elements--;

	pthread_mutex_unlock(&tree->reduce_lock);
	pthread_mutex_unlock(&tree->lock);

	return 0;
}
/*
  The transaction prepare function fills out the fields of the tr structure,
  using the information in the msg structure. It also swaps the fields in the
  message header (which are originally in network byte order).

  Once the prepare function has been called on the transaction, the
  'preprocessed' flag is set and the transaction cannot be prepared again.
  Returns:
  0:  OK
  -1: General Error
  -2: Request received with no key
  -3: Request received with digest array
*/
int as_transaction_prepare(as_transaction *tr) {
	cl_msg *msgp = tr->msgp;
	as_msg *m = &msgp->msg;

#if defined(USE_SYSTEMTAP)
	uint64_t nodeid = g_config.self_node;
#endif

//	cf_assert(tr, AS_PROTO, CF_CRITICAL, "invalid transaction");

	void *limit = ((void *)m) + msgp->proto.sz;

	// Check processed flag.  It's a non-fatal error to call this function again.
	if( tr->preprocessed )
		return(0);

	if (0 != cf_digest_compare(&tr->keyd, &cf_digest_zero)) {
		cf_warning(AS_RW, "Internal inconsistency: transaction has keyd, but marked not swizzled");
	}

	as_msg_swap_header(m);
	if (0 != as_msg_swap_fields_and_ops(m, limit)) {
		cf_info(AS_PROTO, "msg swap field and ops returned error");
		return(-1);
	}

	if (m->n_fields>PROTO_NFIELDS_MAX_WARNING) {
		cf_info(AS_PROTO, "received too many n_fields! %d",m->n_fields);
    }

	// Set the transaction end time if available.
	if (m->transaction_ttl) {
//		cf_debug(AS_PROTO, "received non-zero transaction ttl: %d",m->transaction_ttl);
		tr->end_time = tr->start_time + ((uint64_t)m->transaction_ttl * 1000000);
	}

	// Set the preprocessed flag. All exits from here on out are considered
	// processed since the msg header has been swapped and the transaction
	// end time has been set.
	tr->preprocessed = true;
	tr->flag         = 0;
	UREQ_DATA_INIT(&tr->udata);
	as_msg_field *sfp = as_msg_field_get(m, AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY);
	if (sfp) {
		cf_debug(AS_PROTO, "received request with digest array, batch");
		return(-3);
	}

	// It's tempting to move this to the final return, but queries
	// actually escape in the if clause below ...
	ASD_TRANS_PREPARE(nodeid, (uint64_t) msgp, tr->trid);

	// Sent digest? Use!
	as_msg_field *dfp = as_msg_field_get(m, AS_MSG_FIELD_TYPE_DIGEST_RIPE);
	if (dfp) {
//		cf_detail(AS_PROTO, "transaction prepare: received sent digest\n");
		if (as_msg_field_get_value_sz(dfp) != sizeof(cf_digest)) {
			cf_info(AS_PROTO, "sent bad digest size %d, recomputing digest",as_msg_field_get_value_sz(dfp));
			goto Compute;
		}
		memcpy(&tr->keyd, dfp->data, sizeof(cf_digest));
	}
	// Not sent, so compute.
	else {
Compute:		;
		as_msg_field *kfp = as_msg_field_get(m, AS_MSG_FIELD_TYPE_KEY);
		if (!kfp) {
			cf_detail(AS_PROTO, "received request with no key, scan");
			return(-2);
		}
        if (as_msg_field_get_value_sz(kfp)> PROTO_FIELD_LENGTH_MAX) {
	        cf_info(AS_PROTO, "key field too big %d. Is it for real?",as_msg_field_get_value_sz(kfp));
        }

		as_msg_field *sfp = as_msg_field_get(m, AS_MSG_FIELD_TYPE_SET);
		if (sfp == 0 || as_msg_field_get_value_sz(sfp) == 0) {
			cf_digest_compute(kfp->data, as_msg_field_get_value_sz(kfp), &tr->keyd);
			// cf_info(AS_PROTO, "computing key for sz %d %"PRIx64" bytes %d %d %d %d",as_msg_field_get_value_sz(kfp),*(uint64_t *)&tr->keyd,kfp->data[0],kfp->data[1],kfp->data[2],kfp->data[3]);
		}
		else {
            if (as_msg_field_get_value_sz(sfp)> PROTO_FIELD_LENGTH_MAX) {
		        cf_info(AS_PROTO, "set field too big %d. Is this for real?",as_msg_field_get_value_sz(sfp));
            }
			cf_digest_compute2(sfp->data, as_msg_field_get_value_sz(sfp),
						kfp->data, as_msg_field_get_value_sz(kfp),
						&tr->keyd);
			// cf_info(AS_PROTO, "computing set with key for sz %d %"PRIx64" bytes %d %d",as_msg_field_get_value_sz(kfp),*(uint64_t *)&tr->keyd,kfp->data[0],kfp->data[1],kfp->data[2],kfp->data[3]);
		}
	}

	return(0);
}