/**                                                                    
 * Remove all extended keys for the given @a page from the            
 * extended key cache.                                                
 */                                                                    
static ham_status_t
my_fun_free_page_extkeys(ham_btree_t *be, ham_page_t *page, ham_u32_t flags)
{
    ham_db_t *db=be_get_db(be);
    
    ham_assert(page_get_owner(page) == db, (0));
    
    ham_assert(0 == (flags & ~DB_MOVE_TO_FREELIST), (0));

    /*
     * if this page has a header, and it's either a B-Tree root page or 
     * a B-Tree index page: remove all extended keys from the cache, 
     * and/or free their blobs
     */
    if (page_get_pers(page) 
            && (!(page_get_npers_flags(page)&PAGE_NPERS_NO_HEADER))
            && (page_get_type(page)==PAGE_TYPE_B_ROOT 
                || page_get_type(page)==PAGE_TYPE_B_INDEX)) 
    {
        ham_size_t i;
        ham_offset_t blobid;
        int_key_t *bte;
        btree_node_t *node=ham_page_get_btree_node(page);
        extkey_cache_t *c;

        ham_assert(db, ("Must be set as page owner when this is a Btree page"));
    ham_assert(db == page_get_owner(page), (""));
        c=db_get_extkey_cache(db);

        for (i=0; i<btree_node_get_count(node); i++) 
        {
            bte=btree_node_get_key(db, node, i);
            if (key_get_flags(bte)&KEY_IS_EXTENDED) 
            {
                blobid=key_get_extended_rid(db, bte);
                if (env_get_rt_flags(db_get_env(db))&HAM_IN_MEMORY_DB) 
                {
                    /* clear the blobid to prevent it from being freed twice */
                    *(ham_offset_t *)(key_get_key(bte)+
                        (db_get_keysize(db)-sizeof(ham_offset_t)))=0;
                }
                //(void)key_erase_record(db, bte, 0, BLOB_FREE_ALL_DUPES);
                if (c)
                    (void)extkey_cache_remove(c, blobid);
            }
        }
    }

    return (HAM_SUCCESS);
}
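
/*
 * A minimal sketch (added; not part of the original source) of the
 * extended-key layout that the zeroing code above relies on: for a
 * KEY_IS_EXTENDED key, the blob id occupies the last
 * sizeof(ham_offset_t) bytes of the fixed-size inline key area. The
 * helper mirrors the address arithmetic of the zeroing statement; the
 * real key_get_extended_rid() may additionally perform endian
 * conversion, so treat this purely as an illustration.
 */
static ham_offset_t
sketch_read_extended_rid(ham_db_t *db, int_key_t *bte)
{
    ham_offset_t rid;

    /* the rid sits at the tail of the inline key data */
    memcpy(&rid, key_get_key(bte)
            + (db_get_keysize(db) - sizeof(ham_offset_t)),
            sizeof(rid));
    return (rid);
}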
ham_status_t
btree_traverse_tree(ham_page_t **page_ref, ham_s32_t *idxptr, 
                    ham_db_t *db, ham_page_t *page, ham_key_t *key)
{
    ham_status_t st;
    ham_s32_t slot;
    int_key_t *bte;
    btree_node_t *node=ham_page_get_btree_node(page);

    /*
     * make sure that we're not in a leaf page, and that the 
     * page is not empty
     */
    ham_assert(btree_node_get_count(node)>0, (0));
    ham_assert(btree_node_get_ptr_left(node)!=0, (0));

    st=btree_get_slot(db, page, key, &slot, 0);
    if (st)
    {
        *page_ref = 0;
        return st;
    }

    if (idxptr)
        *idxptr=slot;

    if (slot==-1)
    {
        st = db_fetch_page(page_ref, db, btree_node_get_ptr_left(node), 0);
        ham_assert(st ? !*page_ref : 1, (0));
        return st;
    }
    else 
    {
        bte=btree_node_get_key(db, node, slot);
        ham_assert(key_get_flags(bte)==0 || 
                key_get_flags(bte)==KEY_IS_EXTENDED,
                ("invalid key flags 0x%x", key_get_flags(bte)));
        st = db_fetch_page(page_ref, db, key_get_ptr(bte), 0);
        ham_assert(st ? !*page_ref : 1, (0));
        return st;
    }
}
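
/*
 * Usage sketch (added; not part of the original source): descending
 * from the root to a leaf with btree_traverse_tree(). This mirrors the
 * descent loop in btree_find_cursor() below; error handling is reduced
 * to the essentials.
 */
static ham_status_t
sketch_descend_to_leaf(ham_page_t **leaf_ref, ham_db_t *db,
                ham_page_t *root, ham_key_t *key)
{
    ham_status_t st;
    ham_page_t *page = root;
    btree_node_t *node = ham_page_get_btree_node(page);

    while (!btree_node_is_leaf(node)) {
        st = btree_traverse_tree(&page, 0, db, page, key);
        if (!page)
            return st ? st : HAM_KEY_NOT_FOUND;
        node = ham_page_get_btree_node(page);
    }
    *leaf_ref = page;
    return HAM_SUCCESS;
}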
ham_status_t 
btree_find_cursor(ham_btree_t *be, ham_bt_cursor_t *cursor, 
           ham_key_t *key, ham_record_t *record, ham_u32_t flags)
{
	ham_status_t st;
    ham_page_t *page = NULL;
    btree_node_t *node = NULL;
    int_key_t *entry;
    ham_s32_t idx = -1;
    ham_db_t *db=be_get_db(be);
    find_hints_t hints = {flags, flags, 0, HAM_FALSE, HAM_FALSE, 1};

    btree_find_get_hints(&hints, db, key);

    if (hints.key_is_out_of_bounds) {
        stats_update_find_fail_oob(db, &hints);
        return HAM_KEY_NOT_FOUND;
    }

    if (hints.try_fast_track) {
        /* 
         * see if we get a sure hit within this btree leaf; if not, revert to 
         * regular scan 
         *
         * As this is a speed-improvement hint re-using recent material, the 
         * page should still sit in the cache, or we're using old info, which 
         * should be discarded.
         */
        st = db_fetch_page(&page, db, hints.leaf_page_addr, DB_ONLY_FROM_CACHE);
		ham_assert(st ? !page : 1, (0));
		if (st)
			return st;
        if (page) {
            node=ham_page_get_btree_node(page);
            ham_assert(btree_node_is_leaf(node), (0));
			ham_assert(btree_node_get_count(node) >= 3, (0)); /* edges + middle match */

            idx = btree_node_search_by_key(db, page, key, hints.flags);
            /* 
             * if we didn't hit a match OR a match at either edge, FAIL.
             * A match at one of the edges is very risky, as this can also 
             * signal a match far away from the current node, so we need 
             * the full tree traversal then.
             */
            if (idx <= 0 || idx >= btree_node_get_count(node) - 1) {
                idx = -1;
            }
            /*
             * else: we landed in the middle of the node, so we don't need to
             * traverse the entire tree now.
             */
        }

        /* Reset any errors which may have been collected during the hinting 
         * phase -- this was done by setting 'idx = -1' above, as that 
         * effectively clears the error code that may be stored in 'idx' 
         * (when idx < -1). 
         */
    }

    if (idx == -1) {
        /* get the address of the root page */
        if (!btree_get_rootpage(be)) {
            stats_update_find_fail(db, &hints);
            return HAM_KEY_NOT_FOUND;
        }

        /* load the root page */
        st=db_fetch_page(&page, db, btree_get_rootpage(be), 0);
		ham_assert(st ? !page : 1, (0));
        if (!page) {
            ham_assert(st, (0));
            stats_update_find_fail(db, &hints);
			return st ? st : HAM_INTERNAL_ERROR;
        }

        /* now traverse the root to the leaf nodes, till we find a leaf */
        node=ham_page_get_btree_node(page);
        if (!btree_node_is_leaf(node)) {
            /* signal 'don't care' when we have multiple pages; we resolve 
               this once we've got a hit further down */
            if (hints.flags & (HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH)) 
                hints.flags |= (HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH);

            for (;;) {
                hints.cost++;
                st=btree_traverse_tree(&page, 0, db, page, key);
                if (!page) {
                    stats_update_find_fail(db, &hints);
					return st ? st : HAM_KEY_NOT_FOUND;
                }

                node=ham_page_get_btree_node(page);
                if (btree_node_is_leaf(node))
                    break;
            }
        }

        /* check the leaf page for the key */
        idx=btree_node_search_by_key(db, page, key, hints.flags);
        if (idx < -1) {
            stats_update_find_fail(db, &hints);
            return (ham_status_t)idx;
        }
    }  /* end of regular search */

    /*
     * When we are performing an approximate match, the worst-case
     * scenario is where we've picked the wrong side of the fence
     * while sitting at a page/node boundary: that's what this
     * next piece of code resolves:
     *
     * essentially it moves one record forward or backward when
     * the flags tell us this is mandatory and we're not in the
     * proper position yet.
     *
     * The whole trick works because the code above detects when
     * we need to traverse a multi-page btree -- where this worst-case
     * scenario can happen -- and adjusts the flags to accept
     * both LT and GT approximate matches, so that btree_node_search_by_key()
     * will be hard pressed to return a 'key not found' signal (idx==-1)
     * and will instead deliver the nearest LT or GT match; all we need
     * to do now is ensure we've got the right one and, if not,
     * shift by one.
     */
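    /*
     * Illustration (added): with leaf pages [1 3] and [5 7] and the
     * query 'key <= 4' (LEQ == LT|EXACT), the search may land on
     * slot 0 of the right page with KEY_IS_GT set ('5' is the nearest
     * GT match). Since the caller asked for LT only, the block below
     * steps one slot back -- across the page boundary, via the left
     * sibling -- and delivers '3' with KEY_IS_LT instead.
     */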
    if (idx >= 0) {
        if ((ham_key_get_intflags(key) & KEY_IS_APPROXIMATE) 
            && (hints.original_flags 
                    & (HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH)) 
                != (HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH)) {
            if ((ham_key_get_intflags(key) & KEY_IS_GT) 
                && (hints.original_flags & HAM_FIND_LT_MATCH)) {
                /*
                 * if the index-1 is still in the page, just decrement the
                 * index
                 */
                if (idx > 0) {
                    idx--;
                }
                else {
                    /*
                     * otherwise load the left sibling page
                     */
                    if (!btree_node_get_left(node)) {
                        stats_update_find_fail(db, &hints);
                        ham_assert(node == ham_page_get_btree_node(page), (0));
                        stats_update_any_bound(db, page, key, hints.original_flags, -1);
                        return HAM_KEY_NOT_FOUND;
                    }

                    hints.cost++;
                    st = db_fetch_page(&page, db, btree_node_get_left(node), 0);
					ham_assert(st ? !page : 1, (0));
                    if (!page) {
                        ham_assert(st, (0));
                        stats_update_find_fail(db, &hints);
						return st ? st : HAM_INTERNAL_ERROR;
                    }
                    node = ham_page_get_btree_node(page);
                    idx = btree_node_get_count(node) - 1;
                }
                ham_key_set_intflags(key, (ham_key_get_intflags(key) 
                        & ~KEY_IS_APPROXIMATE) | KEY_IS_LT);
            }
            else if ((ham_key_get_intflags(key) & KEY_IS_LT) 
                    && (hints.original_flags & HAM_FIND_GT_MATCH)) {
                /*
                 * if the index+1 is still in the page, just increment the
                 * index
                 */
                if (idx + 1 < btree_node_get_count(node)) {
                    idx++;
                }
                else {
                    /*
                     * otherwise load the right sibling page
                     */
                    if (!btree_node_get_right(node))
                    {
                        stats_update_find_fail(db, &hints);
                        ham_assert(node == ham_page_get_btree_node(page), (0));
                        stats_update_any_bound(db, page, key, hints.original_flags, -1);
                        return HAM_KEY_NOT_FOUND;
                    }

                    hints.cost++;
                    st = db_fetch_page(&page, db, 
                                    btree_node_get_right(node), 0);
                    if (!page) {
                        ham_assert(st, (0));
                        stats_update_find_fail(db, &hints);
						return st ? st : HAM_INTERNAL_ERROR;
                    }
                    node = ham_page_get_btree_node(page);
                    idx = 0;
                }
                ham_key_set_intflags(key, (ham_key_get_intflags(key) 
                        & ~KEY_IS_APPROXIMATE) | KEY_IS_GT);
            }
        }
        else if (!(ham_key_get_intflags(key) & KEY_IS_APPROXIMATE) 
                && !(hints.original_flags & HAM_FIND_EXACT_MATCH) 
                && (hints.original_flags != 0)) {
            /* 
             * 'true GT/LT' has been added @ 2009/07/18 to complete 
             * the EQ/LEQ/GEQ/LT/GT functionality;
             *
             * 'true LT/GT' is simply an extension upon the already existing 
             * LEQ/GEQ logic just above; all we do here is move one record 
             * up/down as it just happens that we get an exact ('equal') 
             * match here.
             *
             * The fact that the LT/GT constants share their bits with the 
             * LEQ/GEQ flags so that LEQ==(LT|EXACT) and GEQ==(GT|EXACT) 
             * ensures that we can restrict our work to a simple adjustment 
             * right here; everything else has already been taken care of by 
             * the
             * LEQ/GEQ logic in the section above when the key has been 
             * flagged with the KEY_IS_APPROXIMATE flag.
             */
            if (hints.original_flags & HAM_FIND_LT_MATCH)
            {
                /*
                 * if the index-1 is still in the page, just decrement the
                 * index
                 */
                if (idx > 0)
                {
                    idx--;

                    ham_key_set_intflags(key, (ham_key_get_intflags(key) 
                            & ~KEY_IS_APPROXIMATE) | KEY_IS_LT);
                }
                else
                {
                    /*
                     * otherwise load the left sibling page
                     */
                    if (!btree_node_get_left(node))
                    {
                        /* when an error is otherwise unavoidable, see if 
                           we have an escape route through GT? */

                        if (hints.original_flags & HAM_FIND_GT_MATCH)
                        {
                            /*
                             * if the index+1 is still in the page, just 
                             * increment the index
                             */
                            if (idx + 1 < btree_node_get_count(node))
                            {
                                idx++;
                            }
                            else
                            {
                                /*
                                 * otherwise load the right sibling page
                                 */
                                if (!btree_node_get_right(node))
                                {
                                    stats_update_find_fail(db, &hints);
                                    ham_assert(node == ham_page_get_btree_node(page), (0));
                                    stats_update_any_bound(db, page, key, hints.original_flags, -1);
                                    return HAM_KEY_NOT_FOUND;
                                }

                                hints.cost++;
                                st = db_fetch_page(&page, db,
                                                btree_node_get_right(node), 0);
                                if (!page)
                                {
                                    ham_assert(st, (0));
                                    stats_update_find_fail(db, &hints);
									return st ? st : HAM_INTERNAL_ERROR;
                                }
                                node = ham_page_get_btree_node(page);
                                idx = 0;
                            }
                            ham_key_set_intflags(key, (ham_key_get_intflags(key) & 
                                            ~KEY_IS_APPROXIMATE) | KEY_IS_GT);
                        }
                        else
                        {
                            stats_update_find_fail(db, &hints);
                            ham_assert(node == ham_page_get_btree_node(page), (0));
                            stats_update_any_bound(db, page, key, hints.original_flags, -1);
                            return HAM_KEY_NOT_FOUND;
                        }
                    }
                    else
                    {
                        hints.cost++;
                        st = db_fetch_page(&page, db,
                                        btree_node_get_left(node), 0);
                        if (!page)
                        {
                            ham_assert(st, (0));
                            stats_update_find_fail(db, &hints);
							return st ? st : HAM_INTERNAL_ERROR;
                        }
                        node = ham_page_get_btree_node(page);
                        idx = btree_node_get_count(node) - 1;

                        ham_key_set_intflags(key, (ham_key_get_intflags(key) 
                                        & ~KEY_IS_APPROXIMATE) | KEY_IS_LT);
                    }
                }
            }
            else if (hints.original_flags & HAM_FIND_GT_MATCH)
            {
                /*
                 * if the index+1 is still in the page, just increment the
                 * index
                 */
                if (idx + 1 < btree_node_get_count(node))
                {
                    idx++;
                }
                else
                {
                    /*
                     * otherwise load the right sibling page
                     */
                    if (!btree_node_get_right(node))
                    {
                        stats_update_find_fail(db, &hints);
                        ham_assert(node == ham_page_get_btree_node(page), (0));
                        stats_update_any_bound(db, page, key, hints.original_flags, -1);
                        return HAM_KEY_NOT_FOUND;
                    }

                    hints.cost++;
                    st = db_fetch_page(&page, db, 
                                btree_node_get_right(node), 0);
                    if (!page)
                    {
                        ham_assert(st, (0));
                        stats_update_find_fail(db, &hints);
						return st ? st : HAM_INTERNAL_ERROR;
                    }
                    node = ham_page_get_btree_node(page);
                    idx = 0;
                }
                ham_key_set_intflags(key, (ham_key_get_intflags(key) 
                                        & ~KEY_IS_APPROXIMATE) | KEY_IS_GT);
            }
        }
    }

    if (idx<0) {
        stats_update_find_fail(db, &hints);
        ham_assert(node, (0));
        ham_assert(page, (0));
        ham_assert(node == ham_page_get_btree_node(page), (0));
        stats_update_any_bound(db, page, key, hints.original_flags, -1);
        return HAM_KEY_NOT_FOUND;
    }

    /* load the entry, and store record ID and key flags */
    entry=btree_node_get_key(db, node, idx);

    /* set the cursor-position to this key */
    if (cursor) {
        ham_assert(!(bt_cursor_get_flags(cursor)&BT_CURSOR_FLAG_UNCOUPLED), 
                ("coupling an uncoupled cursor, but need a nil-cursor"));
        ham_assert(!(bt_cursor_get_flags(cursor)&BT_CURSOR_FLAG_COUPLED), 
                ("coupling a coupled cursor, but need a nil-cursor"));
        page_add_cursor(page, (ham_cursor_t *)cursor);
        bt_cursor_set_flags(cursor, 
                bt_cursor_get_flags(cursor)|BT_CURSOR_FLAG_COUPLED);
        bt_cursor_set_coupled_page(cursor, page);
        bt_cursor_set_coupled_index(cursor, idx);
    }

    /*
     * during util_read_key and util_read_record, new pages might be needed,
     * and the page at which we're pointing could be moved out of memory; 
     * that would mean that the cursor would be uncoupled, and we're losing
     * the 'entry'-pointer. therefore we 'lock' the page by incrementing 
     * the reference counter
     */
    page_add_ref(page);
    ham_assert(btree_node_is_leaf(node), ("iterator points to internal node"));

    /* no need to load the key if we have an exact match: */
    if (key && (ham_key_get_intflags(key) & KEY_IS_APPROXIMATE)) 
    {
        ham_status_t st=util_read_key(db, entry, key);
        if (st) 
        {
            page_release_ref(page);
            stats_update_find_fail(db, &hints);
            return (st);
        }
    }

    if (record) 
    {
        ham_status_t st;
        record->_intflags=key_get_flags(entry);
        record->_rid=key_get_ptr(entry);
        st=util_read_record(db, record, flags);
        if (st) 
        {
            page_release_ref(page);
            stats_update_find_fail(db, &hints);
            return (st);
        }
    }

    page_release_ref(page);
    
    stats_update_find(db, page, &hints);
    ham_assert(node == ham_page_get_btree_node(page), (0));
    stats_update_any_bound(db, page, key, hints.original_flags, idx);

    return (0);
}
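
/*
 * Usage sketch (added; not part of the original source): a LEQ lookup
 * through btree_find_cursor(). LEQ is expressed as LT|EXACT, as the
 * comment block inside the function explains. The ham_key_t field
 * widths are assumptions; error handling is reduced to passing the
 * status through, and no cursor is coupled.
 */
static ham_status_t
sketch_find_leq(ham_btree_t *be, void *keydata, ham_u16_t keysize,
                ham_record_t *record)
{
    ham_key_t key;

    memset(&key, 0, sizeof(key));
    key.data = keydata;
    key.size = keysize;

    return (btree_find_cursor(be, 0, &key, record,
                HAM_FIND_LT_MATCH | HAM_FIND_EXACT_MATCH));
}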
void 
stats_update_any_bound(ham_db_t *db, struct ham_page_t *page, ham_key_t *key, ham_u32_t find_flags, ham_s32_t slot)
{
    ham_status_t st;
    ham_runtime_statistics_dbdata_t *dbdata = db_get_db_perf_data(db);
    ham_env_t *env = db_get_env(db);
    btree_node_t *node = ham_page_get_btree_node(page);

    ham_assert(env_get_allocator(env) != 0, (0));
    ham_assert(btree_node_is_leaf(node), (0));
    if (!btree_node_get_left(node))
    {
        /* this is the leaf page which carries the lower bound key */
        ham_assert(btree_node_get_count(node) == 0 ? !btree_node_get_right(node) : 1, (0));
        if (btree_node_get_count(node) == 0)
        {
            /* range is empty 
             *
             * do not set the lower/upper boundary; otherwise we may trigger
             * a key comparison with an empty key, and the comparison function
             * might not be equipped to handle this.

             EDIT: the code should be able to handle that particular situation
                   as this was tested a while ago. Besides, the settings here
                   are a signal for the hinter that the table is currently 
                   completely empty and no btree traversal whatsoever is 
                   needed before we find, insert or erase.

             EDIT #2: custom compare routines may b0rk on the NULL data 
                   pointers (the monster test comparison function does, for 
                   example), so the smarter thing to do here is NOT to set 
                   the bounds.

                   The trouble with that approach is that the hinter no longer
                   'knows about' an empty table, but is that so bad? An empty
                   table would constitute only a btree root node anyway, so the
                   regular traversal would be quick anyhow.
             */
            if (dbdata->lower_bound_index != 1
                || dbdata->upper_bound_index != 0)
            {
                /* only set when not done already */
                if (dbdata->lower_bound.data)
                    allocator_free(env_get_allocator(env), dbdata->lower_bound.data);
                if (dbdata->upper_bound.data)
                    allocator_free(env_get_allocator(env), dbdata->upper_bound.data);
                memset(&dbdata->lower_bound, 0, sizeof(dbdata->lower_bound));
                memset(&dbdata->upper_bound, 0, sizeof(dbdata->upper_bound));
                dbdata->lower_bound_index = 1; /* impossible value for lower bound index */
                dbdata->upper_bound_index = 0;
                dbdata->lower_bound_page_address = page_get_self(page);
                dbdata->upper_bound_page_address = 0; /* page_get_self(page); */
                dbdata->lower_bound_set = HAM_TRUE;
                dbdata->upper_bound_set = HAM_FALSE; /* cannot be TRUE or subsequent updates for single record carrying tables may fail */
                //ham_assert(dbdata->lower_bound.data != NULL, (0));
                ham_assert(dbdata->lower_bound_page_address != 0, (0));
            }
        }
        else
        {
            /*
            lower bound key is always located at index [0]

            update our key info when either our current data is undefined (startup condition)
            or the first key was edited in some way (slot == 0). This 'copy anyway' approach 
            saves us one costly key comparison.
            */
            if (dbdata->lower_bound_index != 0
                || dbdata->lower_bound_page_address != page_get_self(page)
                || slot == 0)
            {
                page_add_ref(page);

                /* only set when not done already */
                dbdata->lower_bound_set = HAM_TRUE;
                dbdata->lower_bound_index = 0;
                dbdata->lower_bound_page_address = page_get_self(page);

                if (dbdata->lower_bound.data) {
                    allocator_free(env_get_allocator(env), dbdata->lower_bound.data);
                    dbdata->lower_bound.data=0;
                    dbdata->lower_bound.size=0;
                }

                st = util_copy_key_int2pub(db, 
                    btree_node_get_key(db, node, dbdata->lower_bound_index),
                    &dbdata->lower_bound);
                if (st) 
                {
                    /* panic! in case of failure, just drop the lower bound 
                     * entirely. */
                    if (dbdata->lower_bound.data)
                        allocator_free(env_get_allocator(env), dbdata->lower_bound.data);
                    memset(&dbdata->lower_bound, 0, 
                            sizeof(dbdata->lower_bound));
                    dbdata->lower_bound_index = 0;
                    dbdata->lower_bound_page_address = 0;
                    dbdata->lower_bound_set = HAM_FALSE;
                }
                else
                {
                    ham_assert(dbdata->lower_bound.data == NULL ?
                        dbdata->lower_bound.size == 0 : 
                        dbdata->lower_bound.size > 0, (0));
                    ham_assert(dbdata->lower_bound_page_address != 0, (0));
                }
                page_release_ref(page);
            }
        }
    }

    if (!btree_node_get_right(node)) 
    {
        /* this is the leaf page which carries the upper bound key */
        ham_assert(btree_node_get_count(node) == 0 
                ? !btree_node_get_left(node) 
                : 1, (0));
        if (btree_node_get_count(node) != 0) 
        {
            /* 
             * range is non-empty; the other case has already been handled 
             * above. The upper bound key is always located at index 
             * [size-1]. Update our key info when either our current data 
             * is undefined (startup condition) or the last key was edited 
             * in some way (slot == size-1). This 'copy anyway' approach 
             * saves us one costly key comparison.
             */
            if (dbdata->upper_bound_index != btree_node_get_count(node) - 1
                    || dbdata->upper_bound_page_address != page_get_self(page)
                    || slot == btree_node_get_count(node) - 1) 
            {
                page_add_ref(page);

                /* only set when not done already */
                dbdata->upper_bound_set = HAM_TRUE;
                dbdata->upper_bound_index = btree_node_get_count(node) - 1;
                dbdata->upper_bound_page_address = page_get_self(page);

                if (dbdata->upper_bound.data) {
                    allocator_free(env_get_allocator(env), dbdata->upper_bound.data);
                    dbdata->upper_bound.data=0;
                    dbdata->upper_bound.size=0;
                }

                st = util_copy_key_int2pub(db, 
                    btree_node_get_key(db, node, dbdata->upper_bound_index),
                    &dbdata->upper_bound);
                if (st) 
                {
                    /* panic! in case of failure, just drop the upper bound 
                     * entirely. */
                    if (dbdata->upper_bound.data)
                        allocator_free(env_get_allocator(env), dbdata->upper_bound.data);
                    memset(&dbdata->upper_bound, 0, 
                            sizeof(dbdata->upper_bound));
                    dbdata->upper_bound_index = 0;
                    dbdata->upper_bound_page_address = 0;
                    dbdata->upper_bound_set = HAM_FALSE;
                }
                page_release_ref(page);
            }
        }
    }
}
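
/*
 * Sketch (added; not part of the original source) of how a hinter
 * might consult the bounds cached above before touching the btree.
 * The comparison helper db_compare_pub_keys() is hypothetical -- the
 * original code only shows key_compare_pub_to_int() -- so treat this
 * purely as an outline of the idea: a key proven out of bounds cannot
 * be found.
 */
static ham_bool_t
sketch_key_is_out_of_bounds(ham_db_t *db, ham_key_t *key)
{
    ham_runtime_statistics_dbdata_t *dbdata = db_get_db_perf_data(db);

    if (dbdata->lower_bound_set && dbdata->lower_bound.data
            && db_compare_pub_keys(db, key, &dbdata->lower_bound) < 0)
        return HAM_TRUE;   /* smaller than the smallest stored key */
    if (dbdata->upper_bound_set && dbdata->upper_bound.data
            && db_compare_pub_keys(db, key, &dbdata->upper_bound) > 0)
        return HAM_TRUE;   /* larger than the largest stored key */
    return HAM_FALSE;
}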
static ham_status_t
__insert_split(ham_page_t *page, ham_key_t *key, 
        ham_offset_t rid, insert_scratchpad_t *scratchpad, 
        insert_hints_t *hints)
{
    int cmp;
    ham_status_t st;
    ham_page_t *newpage, *oldsib;
    int_key_t *nbte, *obte;
    btree_node_t *nbtp, *obtp, *sbtp;
    ham_size_t count, keysize;
    ham_db_t *db=page_get_owner(page);
    ham_env_t *env = db_get_env(db);
    ham_key_t pivotkey, oldkey;
    ham_offset_t pivotrid;
    ham_u16_t pivot;
    ham_bool_t pivot_at_end=HAM_FALSE;

    ham_assert(page_get_owner(page), (0));
    ham_assert(device_get_env(page_get_device(page)) 
            == db_get_env(page_get_owner(page)), (0));

    ham_assert(hints->force_append == HAM_FALSE, (0));

    keysize=db_get_keysize(db);

    /*
     * allocate a new page
     */
    hints->cost++;
    st=db_alloc_page(&newpage, db, PAGE_TYPE_B_INDEX, 0); 
    ham_assert(st ? newpage == NULL : 1, (0));
    ham_assert(!st ? newpage != NULL : 1, (0));
    if (st)
        return st; 
    ham_assert(page_get_owner(newpage), (""));
    /* clear the node header */
    memset(page_get_payload(newpage), 0, sizeof(btree_node_t));

    stats_page_is_nuked(db, page, HAM_TRUE);

    /*
     * move half of the key/rid-tuples to the new page
     *
     * !! recno: keys are sorted; we do a "lazy split"
     */
    nbtp=ham_page_get_btree_node(newpage);
    nbte=btree_node_get_key(db, nbtp, 0);
    obtp=ham_page_get_btree_node(page);
    obte=btree_node_get_key(db, obtp, 0);
    count=btree_node_get_count(obtp);

    /*
     * for databases with sequential access (this includes recno databases):
     * do not split in the middle, but at the very end of the page
     *
     * if this page is the right-most page in the index, and this key is 
     * inserted at the very end, then we select the same pivot as for
     * sequential access
     */
    if (db_get_data_access_mode(db)&HAM_DAM_SEQUENTIAL_INSERT)
        pivot_at_end=HAM_TRUE;
    else if (btree_node_get_right(obtp)==0) {
        cmp=key_compare_pub_to_int(db, page, key, btree_node_get_count(obtp)-1);
        if (cmp>0)
            pivot_at_end=HAM_TRUE;
    }

    /*
     * internal pages set the count of the new page to count-pivot-1 (because
     * the pivot element will become ptr_left of the new page).
     * by using pivot=count-2 we make sure that at least 1 element will remain
     * in the new node.
     */
    if (pivot_at_end) {
        pivot=count-2;
    }
    else {
        pivot=count/2;
    }
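    /*
     * Example (added): with count == 8, the normal split picks
     * pivot == 4 (an even split), while the sequential-access split
     * picks pivot == 6, keeping the old page densely packed under
     * append-style workloads.
     */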

    /*
     * uncouple all cursors
     */
    st=bt_uncouple_all_cursors(page, pivot);
    if (st)
        return (st);

    /*
     * if we split a leaf, we'll insert the pivot element in the leaf
     * page, too. in internal nodes, we don't insert it, but propagate
     * it to the parent node only.
     */
    if (btree_node_is_leaf(obtp)) {
        hints->cost += stats_memmove_cost((db_get_int_key_header_size()+keysize)*(count-pivot));
        memcpy((char *)nbte,
               ((char *)obte)+(db_get_int_key_header_size()+keysize)*pivot, 
               (db_get_int_key_header_size()+keysize)*(count-pivot));
    }
    else {
        hints->cost += stats_memmove_cost((db_get_int_key_header_size()+keysize)*(count-pivot-1));
        memcpy((char *)nbte,
               ((char *)obte)+(db_get_int_key_header_size()+keysize)*(pivot+1), 
               (db_get_int_key_header_size()+keysize)*(count-pivot-1));
    }
    
    /* 
     * store the pivot element, we'll need it later to propagate it 
     * to the parent page
     */
    nbte=btree_node_get_key(db, obtp, pivot);

    memset(&pivotkey, 0, sizeof(pivotkey));
    memset(&oldkey, 0, sizeof(oldkey));
    oldkey.data=key_get_key(nbte);
    oldkey.size=key_get_size(nbte);
    oldkey._flags=key_get_flags(nbte);
    st = util_copy_key(db, &oldkey, &pivotkey);
    if (st) 
    {
        (void)db_free_page(newpage, DB_MOVE_TO_FREELIST);
        goto fail_dramatically;
    }
    pivotrid=page_get_self(newpage);

    /*
     * adjust the page count
     */
    if (btree_node_is_leaf(obtp)) {
        btree_node_set_count(obtp, pivot);
        btree_node_set_count(nbtp, count-pivot);
    }
    else {
        btree_node_set_count(obtp, pivot);
        btree_node_set_count(nbtp, count-pivot-1);
    }

    /*
     * if we're in an internal page: fix the ptr_left of the new page
     * (it points to the ptr of the pivot key)
     */ 
    if (!btree_node_is_leaf(obtp)) {
        /* 
         * nbte still contains the pivot key 
         */
        btree_node_set_ptr_left(nbtp, key_get_ptr(nbte));
    }

    /*
     * insert the new element
     */
    hints->cost++;
    cmp=key_compare_pub_to_int(db, page, key, pivot);
    if (cmp < -1) 
    {
        st = (ham_status_t)cmp;
        goto fail_dramatically;
    }

    if (cmp>=0)
        st=__insert_nosplit(newpage, key, rid, 
                scratchpad->record, scratchpad->cursor, hints);
    else
        st=__insert_nosplit(page, key, rid, 
                scratchpad->record, scratchpad->cursor, hints);
    if (st) 
    {
        goto fail_dramatically;
    }
    scratchpad->cursor=0; /* don't overwrite cursor if __insert_nosplit
                             is called again */

    /*
     * fix the double-linked list of pages, and mark the pages as dirty
     */
    if (btree_node_get_right(obtp)) 
    {
        st=db_fetch_page(&oldsib, db, btree_node_get_right(obtp), 0);
        if (st)
            goto fail_dramatically;
    }
    else
    {
        oldsib=0;
    }

    if (oldsib) {
        st=ham_log_add_page_before(oldsib);
        if (st)
            goto fail_dramatically;
    }

    btree_node_set_left (nbtp, page_get_self(page));
    btree_node_set_right(nbtp, btree_node_get_right(obtp));
    btree_node_set_right(obtp, page_get_self(newpage));
    if (oldsib) {
        sbtp=ham_page_get_btree_node(oldsib);
        btree_node_set_left(sbtp, page_get_self(newpage));
        page_set_dirty(oldsib, env);
    }
    page_set_dirty(newpage, env);
    page_set_dirty(page, env);

    /* 
     * propagate the pivot key to the parent page
     */
    ham_assert(!(scratchpad->key.flags & HAM_KEY_USER_ALLOC), (0));
    if (scratchpad->key.data)
        allocator_free(env_get_allocator(env), scratchpad->key.data);
    scratchpad->key=pivotkey;
    scratchpad->rid=pivotrid;
    ham_assert(!(scratchpad->key.flags & HAM_KEY_USER_ALLOC), (0));

    return (SPLIT);

fail_dramatically:
    ham_assert(!(pivotkey.flags & HAM_KEY_USER_ALLOC), (0));
    if (pivotkey.data)
        allocator_free(env_get_allocator(env), pivotkey.data);
    return st;
}
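
/*
 * Illustration (added): the sibling links maintained by the split
 * above. 'P' is the page being split, 'N' the new page, 'S' the old
 * right sibling:
 *
 *     before:   ... <-> P <-> S <-> ...
 *     after:    ... <-> P <-> N <-> S <-> ...
 *
 * which is what the three btree_node_set_left()/btree_node_set_right()
 * calls plus the fix-up of S's left pointer establish.
 */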
static ham_status_t
__insert_nosplit(ham_page_t *page, ham_key_t *key, 
        ham_offset_t rid, ham_record_t *record, 
        ham_bt_cursor_t *cursor, insert_hints_t *hints)
{
    ham_status_t st;
    ham_u16_t count;
    ham_size_t keysize;
    ham_size_t new_dupe_id = 0;
    int_key_t *bte = 0;
    btree_node_t *node;
    ham_db_t *db=page_get_owner(page);
    ham_bool_t exists = HAM_FALSE;
    ham_s32_t slot;

    ham_assert(page_get_owner(page), (0));
    ham_assert(device_get_env(page_get_device(page)) == db_get_env(page_get_owner(page)), (0));

    node=ham_page_get_btree_node(page);
    count=btree_node_get_count(node);
    keysize=db_get_keysize(db);

    if (btree_node_get_count(node)==0)
    {
        slot = 0;
    }
    else if (hints->force_append) 
    {
        slot = count;
    } 
    else if (hints->force_prepend) 
    {
        /* insert at beginning; shift all up by one */
        slot = 0;
    } 
    else 
    {
        int cmp;

        hints->cost++;
        st=btree_get_slot(db, page, key, &slot, &cmp);
        if (st)
            return (st);

        /* insert the new key at the beginning? */
        if (slot == -1) 
        {
            slot = 0;
        }
        else
        {
            /*
             * key exists already
             */
            if (cmp == 0) 
            {
                if (hints->flags & HAM_OVERWRITE) 
                {
                    /* 
                     * no need to overwrite the key - it already exists! 
                     * however, we have to overwrite the data!
                     */
                    if (!btree_node_is_leaf(node)) 
                        return (HAM_SUCCESS);
                }
                else if (!(hints->flags & HAM_DUPLICATE))
                    return (HAM_DUPLICATE_KEY);

                /* do NOT shift keys up to make room; just overwrite the current [slot] */
                exists = HAM_TRUE;
            }
            else 
            {
                /*
                 * otherwise, if the new key is greater than the slot key, 
                 * move to
                 * the next slot
                 */
                if (cmp > 0) 
                {
                    slot++;
                }
            }
        }
    }

    /*
     * in any case, uncouple the cursors and see if we must shift any elements to the
     * right
     */
    bte=btree_node_get_key(db, node, slot);
    ham_assert(bte, (0));

    if (!exists)
    {
        if (count > slot)
        {
            /* uncouple all cursors & shift any elements following [slot] */
            st=bt_uncouple_all_cursors(page, slot);
            if (st)
                return (st);

            hints->cost += stats_memmove_cost((db_get_int_key_header_size()+keysize)*(count-slot));
            memmove(((char *)bte)+db_get_int_key_header_size()+keysize, bte,
                    (db_get_int_key_header_size()+keysize)*(count-slot));
        }

        /*
         * if a new key is created or inserted: initialize it with zeroes
         */
        memset(bte, 0, db_get_int_key_header_size()+keysize);
    }

    /*
     * if we're in the leaf: insert, overwrite or append the blob
     * (depends on the flags)
     */
    if (btree_node_is_leaf(node)) {
        ham_status_t st;

        hints->cost++;
        st=key_set_record(db, bte, record, 
                        cursor
                            ? bt_cursor_get_dupe_id(cursor)
                            : 0, 
                        hints->flags, &new_dupe_id);
        if (st)
            return (st);
        
        hints->processed_leaf_page = page;
        hints->processed_slot = slot;
    }
    else {
        key_set_ptr(bte, rid);
    }

    page_set_dirty(page, db_get_env(db));
    key_set_size(bte, key->size);

    /*
     * set a flag if the key is extended, i.e. does not fit into the 
     * fixed keysize of the btree node
     */
    if (key->size > db_get_keysize(db))
        key_set_flags(bte, key_get_flags(bte)|KEY_IS_EXTENDED);

    /*
     * if we have a cursor: couple it to the new key
     *
     * the cursor is set to nil first, since the coupling below 
     * requires a nil cursor.
     */
    if (cursor) 
    {
        if ((st=bt_cursor_set_to_nil(cursor)))
            return (st);

        ham_assert(!(bt_cursor_get_flags(cursor)&BT_CURSOR_FLAG_UNCOUPLED), 
                ("coupling an uncoupled cursor, but need a nil-cursor"));
        ham_assert(!(bt_cursor_get_flags(cursor)&BT_CURSOR_FLAG_COUPLED), 
                ("coupling a coupled cursor, but need a nil-cursor"));
        bt_cursor_set_flags(cursor, 
                bt_cursor_get_flags(cursor)|BT_CURSOR_FLAG_COUPLED);
        bt_cursor_set_coupled_page(cursor, page);
        bt_cursor_set_coupled_index(cursor, slot);
        bt_cursor_set_dupe_id(cursor, new_dupe_id);
        memset(bt_cursor_get_dupe_cache(cursor), 0, sizeof(dupe_entry_t));
        page_add_cursor(page, (ham_cursor_t *)cursor);
    }

    /*
     * if we've overwritten a key: no need to continue, we're done
     */
    if (exists)
        return (0);

    /*
     * store the key data; an oversized key is truncated here and 
     * handled as an extended key below
     */
    key_set_key(bte, key->data, 
            db_get_keysize(db) < key->size ? db_get_keysize(db) : key->size);

    /*
     * if we need an extended key, allocate a blob and store
     * the blob-id in the key
     */
    if (key->size > db_get_keysize(db)) 
    {
        ham_offset_t blobid;

        key_set_key(bte, key->data, db_get_keysize(db));

        st=key_insert_extended(&blobid, db, page, key);
        ham_assert(st ? blobid == 0 : 1, (0));
        if (!blobid)
            return st ? st : HAM_INTERNAL_ERROR;

        key_set_extended_rid(db, bte, blobid);
    }

    /*
     * update the btree node-header
     */
    btree_node_set_count(node, count+1);

    return (0);
}
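
/*
 * Worked example (added) for the shift in __insert_nosplit(): assume a
 * key header of 11 bytes and a keysize of 21 (illustrative figures
 * only), so each slot occupies 32 bytes. Inserting into slot 2 of a
 * node holding 5 keys moves slots 2..4, i.e. (5 - 2) * 32 = 96 bytes,
 * one slot to the right before the new key is written into the gap.
 */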
static ham_status_t
__insert_in_page(ham_page_t *page, ham_key_t *key, 
        ham_offset_t rid, insert_scratchpad_t *scratchpad, 
        insert_hints_t *hints)
{
    ham_status_t st;
    ham_size_t maxkeys=btree_get_maxkeys(scratchpad->be);
    btree_node_t *node=ham_page_get_btree_node(page);

    ham_assert(maxkeys>1, 
            ("invalid result of btree_get_maxkeys(): %d", maxkeys));
    ham_assert(hints->force_append == HAM_FALSE, (0));
    ham_assert(hints->force_prepend == HAM_FALSE, (0));

    /*
     * prepare the page for modifications
     */
    st=ham_log_add_page_before(page);
    if (st)
        return (st);

    /*
     * if we can insert the new key without splitting the page: 
     * __insert_nosplit() will do the work for us
     */
    if (btree_node_get_count(node)<maxkeys) {
        st=__insert_nosplit(page, key, rid, 
                    scratchpad->record, scratchpad->cursor, hints);
        scratchpad->cursor=0; /* don't overwrite cursor if __insert_nosplit
                                 is called again */
        return (st);
    }

    /*
     * otherwise, we have to split the page.
     * but BEFORE we split, we check if the key already exists!
     */
    if (btree_node_is_leaf(node)) {
        ham_s32_t idx;

        hints->cost++;
        idx = btree_node_search_by_key(page_get_owner(page), page, key, 
                            HAM_FIND_EXACT_MATCH);
        /* key exists! */
        if (idx>=0) {
            ham_assert((hints->flags & (HAM_DUPLICATE_INSERT_BEFORE
                                |HAM_DUPLICATE_INSERT_AFTER
                                |HAM_DUPLICATE_INSERT_FIRST
                                |HAM_DUPLICATE_INSERT_LAST)) 
                    ? (hints->flags & HAM_DUPLICATE)
                    : 1, (0)); 
            if (!(hints->flags & (HAM_OVERWRITE | HAM_DUPLICATE))) 
                return (HAM_DUPLICATE_KEY);
            st=__insert_nosplit(page, key, rid, 
                    scratchpad->record, scratchpad->cursor, hints);
            /* don't overwrite cursor if __insert_nosplit is called again */
            scratchpad->cursor=0; 
            return (st);
        }
    }

    return (__insert_split(page, key, rid, scratchpad, hints));
}
static ham_status_t
__append_key(ham_btree_t *be, ham_key_t *key, ham_record_t *record, 
                ham_bt_cursor_t *cursor, insert_hints_t *hints)
{
    ham_status_t st=0;
    ham_page_t *page;
    btree_node_t *node;
    ham_db_t *db;

#ifdef HAM_DEBUG
    if (cursor && !bt_cursor_is_nil(cursor)) {
        ham_assert(be_get_db(be) == bt_cursor_get_db(cursor), (0));
    }
#endif

    db = be_get_db(be);

    /* 
     * see if we can use this btree leaf for the append/prepend; if not, 
     * revert to a regular insert
     *    
     * As this is a speed-improvement hint re-using recent material, the page 
     * should still sit in the cache, or we're using old info, which should be 
     * discarded.
     */
    st = db_fetch_page(&page, db, hints->leaf_page_addr, DB_ONLY_FROM_CACHE);
    if (st)
        return st;
    if (!page) {
        hints->force_append = HAM_FALSE;
        hints->force_prepend = HAM_FALSE;
        return (__insert_cursor(be, key, record, cursor, hints));
    }

    page_add_ref(page);
    node=ham_page_get_btree_node(page);
    ham_assert(btree_node_is_leaf(node), ("iterator points to internal node"));

    /*
     * if the page is already full OR this page is not the right-most page
     * when we APPEND or the left-most node when we PREPEND
     * OR the new key is not the highest key: perform a normal insert
     */
    if ((hints->force_append && btree_node_get_right(node))
            || (hints->force_prepend && btree_node_get_left(node))
            || btree_node_get_count(node) >= btree_get_maxkeys(be)) {
        page_release_ref(page);
        hints->force_append = HAM_FALSE;
        hints->force_prepend = HAM_FALSE;
        return (__insert_cursor(be, key, record, cursor, hints));
    }

    /*
     * if the page is not empty: check if we append the key at the end / start
     * (depending on force_append/force_prepend),
     * or if it's actually inserted in the middle (when neither force_append 
     * or force_prepend is specified: that'd be SEQUENTIAL insertion 
     * hinting somewhere in the middle of the total key range.
     */
    if (btree_node_get_count(node)!=0) {
        int cmp_hi;
        int cmp_lo;

        hints->cost++;
        if (!hints->force_prepend) {
            cmp_hi = key_compare_pub_to_int(db, page, key, 
                                btree_node_get_count(node)-1);
            /* cmp_hi is an error code, not a comparison result */
            if (cmp_hi < -1) {
                page_release_ref(page);
                return (ham_status_t)cmp_hi;
            }
            /* key is at the end */
            if (cmp_hi > 0) {
                if (btree_node_get_right(node)) {
                    /* not at top end of the btree, so we can't do the 
                     * fast track */
                    page_release_ref(page);
                    //hints->flags &= ~HAM_HINT_APPEND;
                    hints->force_append = HAM_FALSE;
                    hints->force_prepend = HAM_FALSE;
                    return (__insert_cursor(be, key, record, cursor, hints));
                }

                hints->force_append = HAM_TRUE;
                hints->force_prepend = HAM_FALSE;
            }
        }
        else { /* hints->force_prepend is true */
            /* while PREPENDing, treat the key as not bigger than the 
             * highest key in this page */
            cmp_hi = -1;
        }

        if (!hints->force_append) {
            cmp_lo = key_compare_pub_to_int(db, page, key, 0);
            /* cmp_lo is an error code, not a comparison result */
            if (cmp_lo < -1) {
                page_release_ref(page);
                return ((ham_status_t)cmp_lo);
            }
            /* key is at the start of page */
            if (cmp_lo < 0) {
                if (btree_node_get_left(node)) {
                    /* not at bottom end of the btree, so we can't 
                     * do the fast track */
                    page_release_ref(page);
                    //hints->flags &= ~HAM_HINT_PREPEND;
                    hints->force_append = HAM_FALSE;
                    hints->force_prepend = HAM_FALSE;
                    return (__insert_cursor(be, key, record, cursor, hints));
                }

                hints->force_append = HAM_FALSE;
                hints->force_prepend = HAM_TRUE;
            }
        }
        else { /* hints->force_append is true */
            /* while APPENDing, treat the key as not smaller than the 
             * lowest key in this page */
            cmp_lo = +1;
        }

        /* handle inserts in the middle range */
        if (cmp_lo >= 0 && cmp_hi <= 0) {
            /*
             * Depending on where we are in the btree, the current key either
             * is going to end up in the middle of the given node/page,
             * OR the given key is out of range of the given leaf node.
             */
            if (hints->force_append || hints->force_prepend) {
                /*
                 * when prepend or append is FORCED, we are expected to 
                 * add keys ONLY at the beginning or end of the btree
                 * key range. Clearly the current key does not fit that
                 * criterion.
                 */
                page_release_ref(page);
                //hints->flags &= ~HAM_HINT_PREPEND;
                hints->force_append = HAM_FALSE;
                hints->force_prepend = HAM_FALSE;
                return (__insert_cursor(be, key, record, cursor, hints));
            }

            /* 
             * we discovered that the key must be inserted in the middle 
             * of the current leaf.
             * 
             * It does not matter whether the current leaf is at the start or
             * end of the btree range; as we need to add the key in the middle
             * of the current leaf, that info alone is enough to continue with
             * the fast track insert operation.
             */
            ham_assert(!hints->force_prepend && !hints->force_append, (0));
        }

        ham_assert((hints->force_prepend + hints->force_append) < 2, 
                ("Either APPEND or PREPEND flag MAY be set, but not both"));
    }
    else { /* empty page: force insertion in slot 0 */
        hints->force_append = HAM_FALSE;
        hints->force_prepend = HAM_TRUE;
    }

    /*
     * the page will be changed - write it to the log (if a log exists)
     */
    st=ham_log_add_page_before(page);
    if (st) {
        page_release_ref(page);
        return (st);
    }

    /*
     * OK - we're really appending/prepending the new key.
     */
    ham_assert(hints->force_append || hints->force_prepend, (0));
    st=__insert_nosplit(page, key, 0, record, cursor, hints);

    page_release_ref(page);
    return (st);
}
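
/*
 * Summary (added) of the fast-track decision in __append_key():
 *
 *   page full, or wrong edge page for the hint  -> __insert_cursor()
 *   cmp_hi > 0 and no right sibling             -> confirm APPEND
 *   cmp_lo < 0 and no left sibling              -> confirm PREPEND
 *   key inside the leaf's range while an
 *   APPEND/PREPEND hint is forced               -> __insert_cursor()
 *   empty page                                  -> force PREPEND
 */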
ham_s32_t 
btree_node_search_by_key(ham_db_t *db, ham_page_t *page, ham_key_t *key, 
                    ham_u32_t flags)
{
    int cmp; /* [i_a] */
    ham_s32_t slot;
    ham_status_t st;
    btree_node_t *node=ham_page_get_btree_node(page);

    /* ensure the approx flag is NOT set by anyone yet */
    ham_key_set_intflags(key, ham_key_get_intflags(key) & ~KEY_IS_APPROXIMATE);

    if (btree_node_get_count(node)==0)
        return (-1);

    st=btree_get_slot(db, page, key, &slot, &cmp);
    if (st) {
        ham_assert(st < -1, (0));
        return st;
    }

    /*
       'approximate matching'

        When we get here and cmp != 0 and we're looking for LT/GT/LEQ/GEQ 
        key matches, this is where we need to do our prep work.

        Yes, due to the flag tweak in a caller when we have (the usual) 
        multi-page DB table B+tree, both LT and GT flags are 'ON' here, 
        but let's not get carried away and assume that is always
        like that. To elaborate a bit here: it may seem like doing something 
        simple the hard way, but in here, we do NOT know if there are 
        adjacent pages, so 'edge cases' like the scenarios 1, 2, and 5 below 
        should NOT return an error KEY_NOT_FOUND but instead produce a 
        valid slot AND, most important, the accompanying 'sign' (LT/GT) flags 
        for that slot, so that the outer call can analyze our response and 
        shift the key index into the left or right adjacent page, when such 
        is available. We CANNOT see that here, so we always should work with 
        both LT+GT enabled here.
        And to make matters a wee bit more complex still: the one exception 
        to the above is when we have a single-page table: then we get 
        the actual GT/LT flags in here, as we're SURE there won't be any 
        left or right neighbour pages for us to shift into when the need 
        arises.

        Anyway, the purpose of the next section is to see if we have a 
        matching 'approximate' key AND feed the 'sign' (i.e. LT(-1) or 
        GT(+1)) back to the caller, who knows _exactly_ what the
        user asked for and can thus take the proper action there.

        Here, we are only concerned about determining which key index we 
        should produce, IFF we should produce a matching key.

        Assume the following page layout, with two keys (values 2 and 4):

      * index:
      *    [0]   [1]  
      * +-+---+-+---+-+
      * | | 2 | | 4 | |
      * +-+---+-+---+-+

        Various scenarios apply. For the key search (key ~ 1) i.e. (key=1, 
        flags=NEAR), we get this:

        cmp = -1;
        slot = -1;

        hence we point here:

      *  |
      *  V
      * +-+---+-+---+-+
      * | | 2 | | 4 | |
      * +-+---+-+---+-+

        which is not a valid spot. Should we return a key? YES, since no key 
        is less than '1', but there exists a key '2' which fits as NEAR allows 
        for both LT and GT. Hence, this should be modified to become

        slot=0
        sign=GT

      *     | ( slot++ )
      *     V
      * +-+---+-+---+-+
      * | | 2 | | 4 | |
      * +-+---+-+---+-+


        Second scenario: key <= 1, i.e. (key=1, flags=LEQ)
        which gives us the same as above:

       cmp = -1;
       slot = -1;

        hence we point here:

      *  |
      *  V
      * +-+---+-+---+-+
      * | | 2 | | 4 | |
      * +-+---+-+---+-+

        Should we return a valid slot by adjusting? Your common sense says 
        NO, but the correct answer is YES: we do not know exactly what the 
        user asked, as _we_ see it in here as 'key ~ 1' anyway, and 
        we must allow the caller to adjust the slot by moving it into the 
        left neighbour page -- an action we cannot perform, as we do not 
        know, in here, whether there are more pages adjacent to the one 
        we're currently looking at.

        EXCEPT... the common sense answer 'NO' is CORRECT when we have a 
        single-page db table in our hands; see the remark at the top of this 
        comment section; in that case, we can safely say 'NO' after all.

        Third scenario: key ~ 3
        which gives us either:

       cmp = -1;
       slot = 1;

         or

       cmp = 1;
       slot = 0;

         As we check for NEAR instead of just LT or GT, both are okay like 
         that, no adjustment needed.
         All we need to do is make sure we pass along the proper LT/GT 
         'sign' flags for outer level result processing.

      
        Fourth scenario: key < 3

        again, we get either:

       cmp = -1;
       slot = 1;

         or

       cmp = 1;
       slot = 0;

        but this time around, since we are looking for LT, we'll need to 
        adjust the first result, when that happens, by slot-- and sending 
        the appropriate 'sign' flags.
    
      Fifth scenario: key ~ 5

        which gives us:

       cmp = -1;
       slot = 1;

         hence we point here:

      *           |
      *           V
      * +-+---+-+---+-+
      * | | 2 | | 4 | |
      * +-+---+-+---+-+

        Should we return this valid slot? Yup, as long as we mention that 
        it's an LT(less than) key; the caller can see that we returned the 
        slot as the upper bound of this page and adjust accordingly when
        the actual query was 'key > 5' instead of 'key ~ 5' which is how we 
        get to see it.
    */
    /*
      Note that we have a 'preference' for LT answers in here; if the user 
        asked a NEAR question, most of the time that produces an LT answer, 
        i.e. NEAR behaves like LT -- mark the word 'most' in there: this 
        does not happen when we end up at a page's lower bound.
     */
    if (cmp)
    {
        /*
         When slot == -1, you're in a special situation: you do NOT know what 
         the comparison with slot[-1] delivers, because there _is_ _no_ slot 
         -1, but you _do_ know what slot[0] delivered: 'cmp' is the
         value for that one then.
         */
        if (slot < 0) 
            slot = 0;

        ham_assert(slot <= btree_node_get_count(node) - 1, (0));

        if (flags & HAM_FIND_LT_MATCH)
        {
            if (cmp < 0)
            {
                /* key @ slot is LARGER than the key we search for ... */
                if (slot > 0)
                {
                    slot--;
                    ham_key_set_intflags(key, ham_key_get_intflags(key) | KEY_IS_LT);
                    cmp = 0;
                }
                else if (flags & HAM_FIND_GT_MATCH)
                {
                    ham_assert(slot == 0, (0));
                    ham_key_set_intflags(key, ham_key_get_intflags(key) | KEY_IS_GT);
                    cmp = 0;
                }
            }
            else
            {
                /* key @ slot is SMALLER than the key we search for */
                ham_assert(cmp > 0, (0));
                ham_key_set_intflags(key, ham_key_get_intflags(key) | KEY_IS_LT);
                cmp = 0;
            }
        } 
        else if (flags&HAM_FIND_GT_MATCH)   
        {
            /*
             When we get here, we're sure HAM_FIND_LT_MATCH is NOT set...
             */
            ham_assert(!(flags&HAM_FIND_LT_MATCH), (0));

            if (cmp < 0)
            {
                /* key @ slot is LARGER than the key we search for ... */
                ham_key_set_intflags(key, ham_key_get_intflags(key) | KEY_IS_GT);
                cmp = 0;
            }
            else
            {
                /* key @ slot is SMALLER than the key we search for */
                ham_assert(cmp > 0, (0));
                if (slot < btree_node_get_count(node) - 1)
                {
                    slot++;
                    ham_key_set_intflags(key, ham_key_get_intflags(key) | KEY_IS_GT);
                    cmp = 0;
                }
            }
        }
    }

    if (cmp)
        return (-1);

    ham_assert(slot >= -1, (0));
    return (slot);
}
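
/*
 * Usage sketch (added; not part of the original source): an
 * approximate lookup within a single leaf. 'NEAR' is expressed as
 * LT|GT, matching the flag tweak described in the comment block inside
 * the function; the 'sign' of the match is read back from the key's
 * internal flags.
 */
static void
sketch_near_lookup(ham_db_t *db, ham_page_t *leaf, ham_key_t *key)
{
    ham_s32_t slot = btree_node_search_by_key(db, leaf, key,
                HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH);

    if (slot < -1) {
        /* values below -1 are error codes */
    }
    else if (slot == -1) {
        /* no usable match in this leaf */
    }
    else if (ham_key_get_intflags(key) & KEY_IS_LT) {
        /* slot holds the nearest smaller key */
    }
    else if (ham_key_get_intflags(key) & KEY_IS_GT) {
        /* slot holds the nearest greater key */
    }
    else {
        /* exact match at slot */
    }
}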
/**
 * perform a binary search for the slot of the largest key that is <= the
 * given key; *slot is set to -1 when the key is smaller than the first key
 * of the node, and *pcmp receives the result of the last comparison
 */
ham_status_t 
btree_get_slot(ham_db_t *db, ham_page_t *page, 
        ham_key_t *key, ham_s32_t *slot, int *pcmp)
{
    int cmp = -1;
    btree_node_t *node = ham_page_get_btree_node(page);
    ham_s32_t r = btree_node_get_count(node)-1;
    ham_s32_t l = 1;
    ham_s32_t i;
    ham_s32_t last = MAX_KEYS_PER_NODE + 1;

    ham_assert(btree_node_get_count(node)>0, 
            ("node is empty"));

    /*
     * only one element in this node?
     */
    if (r==0) {
        cmp=key_compare_pub_to_int(db, page, key, 0);
        if (cmp < -1)
            return (ham_status_t)cmp;
        *slot=cmp<0 ? -1 : 0;
        goto bail;
    }

    for (;;) {
        /* [i_a] compare is not needed     (while (r>=0)) */

        /* get the median item; if it's identical with the "last" item,
         * we've found the slot */
        i=(l+r)/2;

        if (i==last) {
            *slot=i;
            cmp=1;
            ham_assert(i >= 0, (0));
            ham_assert(i < MAX_KEYS_PER_NODE + 1, (0));
            break;
        }
        
        /* compare it against the key */
        cmp=key_compare_pub_to_int(db, page, key, (ham_u16_t)i);
        if (cmp < -1)
            return (ham_status_t)cmp;

        /* found it? */
        if (cmp==0) {
            *slot=i;
            break;
        }

        /* if the key is smaller than the item: search "to the left" */
        if (cmp<0) {
            if (r==0) {
                ham_assert(i == 0, (0));
                *slot=-1;
                break;
            }
            r=i-1;
        }
        else {
            last=i;
            l=i+1;
        }
    }
    
bail:
    if (pcmp /* && *slot!=-1 */) {
        /*
           [i_a] reduced the total number of key comparisons; this one is not 
                 needed any more, as it was only really required to 
                 compensate for the (i==last) conditional jump above.
                 So we can simply use 'cmp' as-is.
        */
        *pcmp=cmp;
    }

    return (0);
}
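
/*
 * Worked trace (added) of btree_get_slot(), using the two-key leaf
 * [2, 4] from the scenario comments in btree_node_search_by_key():
 * searching for key 3 starts with l=1, r=1, last=MAX_KEYS_PER_NODE+1.
 * Round 1: i=1, key < item[1]=4, so r=0. Round 2: i=0, key > item[0]=2,
 * so last=0 and l=1. Round 3: i=(1+0)/2=0 equals 'last', so the loop
 * stops with *slot=0 and *pcmp=1: key 3 sorts directly after slot 0.
 */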