void
stats_flush_dbdata(ham_db_t *db, ham_runtime_statistics_dbdata_t *dbdata, ham_bool_t last_in_env)
{
    /* 
    the freelist statistics are persisted through the freelist destructor,
    which is invoked elsewhere, so all we need to worry about here are the
    'global' db/env oriented find/insert/erase statistics.
    
    TODO: 
    
    persist those in the db header, that is IFF we're a v1.1.0+ DB
    and we're the last one in the environment (or running solo).
    */
    if (last_in_env)
    {
        /* do we have the new freelist statistics persisting format or are we using an older DB format? */
        if (!db_is_mgt_mode_set(db_get_data_access_mode(db), HAM_DAM_ENFORCE_PRE110_FORMAT))
        {

        }
    }
}
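
/*
 * A minimal sketch (not part of the original source) of what the TODO
 * above might expand to. The format guard is the same one used in the
 * body; db_persist_global_stats() is a HYPOTHETICAL helper standing in
 * for whatever routine would serialize the find/insert/erase counters
 * into the v1.1.0+ DB header:
 *
 *     if (last_in_env
 *         && !db_is_mgt_mode_set(db_get_data_access_mode(db),
 *                                HAM_DAM_ENFORCE_PRE110_FORMAT)) {
 *         db_persist_global_stats(db, dbdata);   // hypothetical helper
 *     }
 */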
static ham_size_t
__get_sorted_position(ham_db_t *db, dupe_table_t *table, ham_record_t *record,
                ham_u32_t flags)
{
    ham_duplicate_compare_func_t foo = db_get_duplicate_compare_func(db);
    ham_size_t l, r, m;
    int cmp;
    dupe_entry_t *e;
    ham_record_t item_record;
    ham_u16_t dam;
    ham_status_t st=0;

    /*
     * Use a slightly adapted form of binary search: as we already have our 
     * initial position (as was stored in the cursor), we take that as our
     * first 'median' value and go from there.
     */
    l = 0;
    r = dupe_table_get_count(table) - 1; /* get_count() returns the entry
                                          * count, so the last valid index
                                          * is count - 1 */

    /*
     * Maybe Wrong Idea: sequential access/insert doesn't mean the RECORD 
     * values are sequential too! They MAY be, but they don't have to be.
     *
     * For now, we assume they are also sequential when you're storing records
     * in duplicate-key tables (probably a secondary index table for another
     * table, this one).
     */
    dam = db_get_data_access_mode(db);
    if (dam & HAM_DAM_SEQUENTIAL_INSERT) {
        /* assume the insertion point sits at the end of the dupe table */
        m = r;
    }
    else {
        m = (l + r) / 2;
    }
    ham_assert(m <= r, (0));
    //ham_assert(r >= 1, (0));
        
    while (l <= r) {
        ham_assert(m<dupe_table_get_count(table), (""));

        e = dupe_table_get_entry(table, m);

        memset(&item_record, 0, sizeof(item_record));
        item_record._rid=dupe_entry_get_rid(e);
        item_record._intflags = dupe_entry_get_flags(e)&(KEY_BLOB_SIZE_SMALL
                                                         |KEY_BLOB_SIZE_TINY
                                                         |KEY_BLOB_SIZE_EMPTY);
        st=util_read_record(db, &item_record, 
                            (ham_u64_t *)dupe_entry_get_ridptr(e), flags);
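    /* note: on failure, the ham_status_t error code is returned as-is,
     * even though this function's return type is the insertion position
     * (ham_size_t) */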
        if (st)
            return (st);

        cmp = foo(db, record->data, record->size, 
                        item_record.data, item_record.size);
        /* item is lower than the left-most item of our range */
        if (m == l) {
            if (cmp < 0)
                break;
        }
        if (l == r) {
            if (cmp >= 0) {
                /* write GEQ record value in NEXT slot */
                m++;
            }
            else /* if (cmp < 0) */ {
                ham_assert(m == r, (0));
            }
            break;
        }
        else if (cmp == 0) {
            /* write equal record value in NEXT slot */
            m++;
            break;
        }
        else if (cmp < 0) {
            if (m == 0) /* new item will be smallest item in the list */
                break;
            r = m - 1;
        }
        else {
            /* write GE record value in NEXT slot, when we have nothing 
             * left to search */
            m++;
            if (m > r) {
                /* we started at the right-most slot (sequential hint)
                 * and the new item is greater than all entries */
                break;
            }
            l = m;
        }
        m = (l + r) / 2;
    }

    /* now 'm' points at the insertion point in the table */
    return (m);
}
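
/*
 * Standalone sketch (not part of the original source) of the adapted
 * binary search that __get_sorted_position() implements, reduced to a
 * plain int array so it needs none of the hamsterdb internals. The
 * caller may seed the first 'median' at the last slot (the sequential-
 * insert case above), and values equal to an existing entry are placed
 * AFTER it, mirroring the "write equal record value in NEXT slot" rule.
 */
static unsigned
sorted_insert_position(const int *table, unsigned count, int value,
                       int seed_at_end)
{
    unsigned l = 0, r = count - 1;      /* count must be >= 1 */
    unsigned m = seed_at_end ? r : (l + r) / 2;

    while (l <= r) {
        int cmp = (value > table[m]) - (value < table[m]);

        /* item is lower than the left-most item of our range */
        if (m == l && cmp < 0)
            break;
        if (l == r) {
            if (cmp >= 0)
                m++;                    /* GEQ value goes in NEXT slot */
            break;
        }
        if (cmp == 0) {
            m++;                        /* equal value goes in NEXT slot */
            break;
        }
        if (cmp < 0) {
            if (m == 0)                 /* new smallest item of all */
                break;
            r = m - 1;
        }
        else {
            if (m == r) {               /* greater than all remaining */
                m++;
                break;
            }
            l = m + 1;
        }
        m = (l + r) / 2;
    }
    return m;                           /* the insertion slot */
}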
void 
btree_insert_get_hints(insert_hints_t *hints, ham_db_t *db, ham_key_t *key)
{
    ham_runtime_statistics_dbdata_t *dbdata = db_get_db_perf_data(db);
    ham_bt_cursor_t *cursor = (ham_bt_cursor_t *)hints->cursor;

    ham_assert(hints->force_append == HAM_FALSE, (0));
    ham_assert(hints->force_prepend == HAM_FALSE, (0));
    ham_assert(hints->try_fast_track == HAM_FALSE, (0));

    if ((hints->flags & HAM_HINT_APPEND) && (cursor))
    {
        if (!bt_cursor_is_nil(cursor))
        {
            ham_assert(bt_cursor_is_nil(cursor)==0, ("cursor must not be nil"));
            ham_assert(db == bt_cursor_get_db(cursor), (0));

            /*
             fetch the cursor's page. We deem the cost of an uncoupled cursor
             too high, as uncoupling implies a full-fledged key search on the
             given key - which can be rather costly - so we rather wait for the
             statistical cavalry a little further down.
             */
            if (bt_cursor_get_flags(cursor) & BT_CURSOR_FLAG_COUPLED) 
            {
                ham_page_t *page = bt_cursor_get_coupled_page(cursor);
                btree_node_t *node = ham_page_get_btree_node(page);
                ham_assert(btree_node_is_leaf(node), 
                            ("cursor points to internal node"));
                //ham_assert(!btree_node_get_right(node), ("cursor points to leaf node which is NOT the uppermost/last one"));
                /*
                 * if cursor is not coupled to the LAST (right-most) leaf
                 * in the Database, it does not make sense to append
                 */
                if (btree_node_get_right(node)) {
                    hints->force_append = HAM_FALSE;
                    hints->try_fast_track = HAM_FALSE;
                }
                else {
                    hints->leaf_page_addr = page_get_self(page);
                    hints->force_append = HAM_TRUE;
                    hints->try_fast_track = HAM_TRUE;
                }
            }
        }
    }
    else if ((hints->flags & HAM_HINT_PREPEND) && (cursor))
    {
        if (!bt_cursor_is_nil(cursor))
        {
            ham_assert(bt_cursor_is_nil(cursor)==0, ("cursor must not be nil"));
            ham_assert(db == bt_cursor_get_db(cursor), (0));

            /*
             fetch the cursor's page. We deem the cost of an uncoupled cursor
             too high, as uncoupling implies a full-fledged key search on the
             given key - which can be rather costly - so we rather wait for the
             statistical cavalry a little further down.
             */
            if (bt_cursor_get_flags(cursor) & BT_CURSOR_FLAG_COUPLED) 
            {
                ham_page_t *page = bt_cursor_get_coupled_page(cursor);
                btree_node_t *node = ham_page_get_btree_node(page);
                ham_assert(btree_node_is_leaf(node), 
                        ("cursor points to internal node"));
                //ham_assert(!btree_node_get_left(node), ("cursor points to leaf node which is NOT the lowest/first one"));
                /*
                 * if cursor is not coupled to the FIRST (left-most) leaf
                 * in the Database, it does not make sense to prepend
                 */
                if (btree_node_get_left(node)) {
                    hints->force_prepend = HAM_FALSE;
                    hints->try_fast_track = HAM_FALSE;
                }
                else {
                    hints->leaf_page_addr = page_get_self(page);
                    hints->force_prepend = HAM_TRUE;
                    hints->try_fast_track = HAM_TRUE;
                }
            }
        }
    }
    //hints->flags &= ~(HAM_HINT_APPEND | HAM_HINT_PREPEND);

    /* 
    The statistical cavalry:

    - when the given key is positioned beyond the end, hint 'append' anyway.

    - when the given key is positioned before the start, hint 'prepend' anyway.
    
    NOTE: this 'auto-detect' mechanism (thanks to the key bounds being collected
    through the statistics-gathering calls) renders the manual options
    HAM_HINT_APPEND/_PREPEND somewhat obsolete, really. 

    The only advantage of manually specifying HAM_HINT_APPEND/_PREPEND is that
    it can save you two key comparisons in here.
    */
    ham_assert(!(key->_flags & KEY_IS_EXTENDED), (0));
    key->_flags &= ~KEY_IS_EXTENDED;

    if (!hints->try_fast_track)
    {
        ham_runtime_statistics_opdbdata_t *opstats = db_get_op_perf_data(db, HAM_OPERATION_STATS_INSERT);

        ham_assert(opstats != NULL, (0));

        if (hints->flags & (HAM_HINT_APPEND | HAM_HINT_PREPEND))
        {
            /* find specific: APPEND / PREPEND --> SEQUENTIAL */
            hints->flags &= ~(HAM_HINT_APPEND | HAM_HINT_PREPEND); 
            hints->flags |= HAM_HINT_SEQUENTIAL;
        }

        if ((hints->flags & HAM_HINTS_MASK) == 0)
        {
            /* no local preference specified; go with the DB-wide DAM config */
            switch (db_get_data_access_mode(db) & ~HAM_DAM_ENFORCE_PRE110_FORMAT)
            {
            default:
                break;

            case HAM_DAM_SEQUENTIAL_INSERT:
                hints->flags |= HAM_HINT_SEQUENTIAL;
                break;
            }
        }

        switch (hints->flags & HAM_HINTS_MASK)
        {
        default:
        case HAM_HINT_RANDOM_ACCESS:
            /* do not provide any hints for the fast track */
            break;

        case HAM_HINT_SEQUENTIAL:
            /*
            when we have three or more hits on the same page already, we'll
            assume this one will end up there as well. As this counter resets
            itself on the first FAIL, there's no harm in acting this quickly.
            In pathological cases, the worst that can happen is that in 20% of
            cases an extra check is performed on a cached btree leaf node,
            which is still minimal overhead.
            */
            if (opstats->btree_last_page_sq_hits >= 3)
            {
                hints->leaf_page_addr = opstats->btree_last_page_addr;
                hints->try_fast_track = HAM_TRUE;
                break;
            }
            /* fall through! (the if (0) wrapper ensures the UBER_FAST case
             * below is entered only through its case label, while the plain
             * SEQUENTIAL path skips it and proceeds to the shared ratio
             * check) */
            if (0)
            {
        case HAM_HINT_SEQUENTIAL | HAM_HINT_UBER_FAST_ACCESS:
            /* same as above, but now act as fast as possible on this info */
            if (opstats->btree_last_page_sq_hits >= 1)
            {
                hints->leaf_page_addr = opstats->btree_last_page_addr;
                hints->try_fast_track = HAM_TRUE;
                break;
            }
            }
            {
                /* 
                we assume this request is located near the previous request, so we check
                if there's anything in the statistics that can help out.

                Note #1: since the hinting counts are 'aged' down to a value of 0..1K (with 2K peak),
                we don't need to use a 64-bit integer for the ratio calculation here.

                Note #2: the ratio is only 'trustworthy' when the base count is about 4 or higher.
                This is because the aging rounds up while scaling down, which means one single FAIL
                can get you a ratio as large as 50% when total count is 1 as well, due to
                either startup or aging rescale; without this minimum size check, the ratio + aging
                would effectively stop the hinter from working after either an aging step or when
                a few FAILs happen during the initial few FIND operations (startup condition).

                EDIT: the above bit about the hinter stopping due to too many
                FAILs at startup or after a rescale does NOT apply any more, as
                the hinter now also includes checks which trigger when a (small)
                series of hits on the same page is seen, which acts as a
                restarter for this as well.
                */
                ham_u32_t ratio = opstats->btree_hinting_fail_count;

                ratio = ratio * 1000 / (1 + opstats->btree_hinting_count);
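                /* worked example: fail_count = 3, hinting_count = 20 gives
                 * ratio = 3 * 1000 / 21 = 142, i.e. a ~14% fail rate; that is
                 * below the 200 (20%) threshold, so the hint is offered */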
                if (ratio < 200)
                {
                    hints->leaf_page_addr = opstats->btree_last_page_addr;
                    hints->try_fast_track = HAM_TRUE;
                    hints->force_append = HAM_TRUE;
                }
            }
            
            if (dbdata->lower_bound_set)
            {
                if (dbdata->lower_bound_index == 1)
                {
                    /*
                    impossible index: this is a marker to signal the table 
                    is completely empty
                    */
                    //hints->flags |= HAM_HINT_PREPEND;
                    hints->force_prepend = HAM_TRUE;
                    hints->leaf_page_addr = dbdata->lower_bound_page_address;
                    hints->try_fast_track = HAM_TRUE;
                }
                else
                {
                    int cmp;
                    
                    ham_assert(dbdata->lower_bound_index == 0, (0));
                    ham_assert(dbdata->lower_bound.data == NULL ?
                        dbdata->lower_bound.size == 0 : 
                        dbdata->lower_bound.size > 0, (0));
                    ham_assert(dbdata->lower_bound_page_address != 0, (0));
                    cmp = db_compare_keys(db, key, &dbdata->lower_bound);

                    if (cmp < 0)
                    {
                        //hints->flags |= HAM_HINT_PREPEND;
                        hints->force_prepend = HAM_TRUE;
                        hints->leaf_page_addr = dbdata->lower_bound_page_address;
                        hints->try_fast_track = HAM_TRUE;
                    }
                }
            }

            if (dbdata->upper_bound_set)
            {
                int cmp;
                
                ham_assert(dbdata->upper_bound_index >= 0, (0));
                ham_assert(dbdata->upper_bound.data == NULL ?
                    dbdata->upper_bound.size == 0 : 
                    dbdata->upper_bound.size > 0, (0));
                ham_assert(dbdata->upper_bound_page_address != 0, (0));
                cmp = db_compare_keys(db, key, &dbdata->upper_bound);

                if (cmp > 0)
                {
                    //hints->flags |= HAM_HINT_APPEND;
                    hints->force_append = HAM_TRUE;
                    hints->leaf_page_addr = dbdata->upper_bound_page_address;
                    hints->try_fast_track = HAM_TRUE;
                }
            }
            break;
        }
    }

    /* 
    we don't hint about jumping to the last accessed leaf node immediately
    
    EDIT:
    
    now we do: see the flags + DAM code above; this happens when neither the
    PREPEND nor the APPEND hint is specified 
    */
}
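
/*
 * Caller-side sketch (not part of the original source) of the manual
 * hint discussed above: an application that inserts keys in ascending
 * order can pass HAM_HINT_APPEND to skip the two bounds comparisons the
 * auto-detection would otherwise perform. Error handling and cursor
 * setup are elided; 'cursor' is assumed to be a ham_cursor_t* opened
 * on the target database.
 *
 *     ham_key_t key = {0};
 *     ham_record_t record = {0};
 *     ham_u32_t i;
 *
 *     for (i = 0; i < 1000; i++) {
 *         key.data = &i;
 *         key.size = sizeof(i);
 *         ham_cursor_insert(cursor, &key, &record, HAM_HINT_APPEND);
 *     }
 */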
/*
 * NOTE:
 * 
 * The current statistics collectors recognize scenarios where insert & 
 * delete mix with find, as both insert and delete (pardon, erase)
 * can split/merge/rebalance the B-tree and thus completely INVALIDATE
 * btree leaf nodes, the address of which is kept in DB-wide statistics
 * storage. The current way of doing things is to keep the statistics simple, 
 * i.e. btree leaf node pointers are nuked when an insert operation splits 
 * them or an erase operation merges or erases such pages. I don't think 
 * it's really useful to have complex leaf-node tracking in here to improve 
 * hinting in such mixed use cases.
 */
void 
btree_find_get_hints(find_hints_t *hints, ham_db_t *db, ham_key_t *key)
{
    ham_runtime_statistics_dbdata_t *dbdata = db_get_db_perf_data(db);
    ham_runtime_statistics_opdbdata_t *opstats = db_get_op_perf_data(db, HAM_OPERATION_STATS_FIND);
    ham_u32_t flags = hints->flags;

    ham_assert(hints->key_is_out_of_bounds == HAM_FALSE, (0));
    ham_assert(hints->try_fast_track == HAM_FALSE, (0));

    /*
    we can only give some possibly helpful hints, when we
    know the tree leaf node (page) we can direct find() to...
    */
    if (opstats->btree_last_page_addr != 0)
    {
        /*
        When we're in SEQUENTIAL mode, we'll advise checking the previously used leaf.
        When the FAIL ratio rises above a certain threshold, we STOP hinting, as we
        clearly hinted WRONG before. We'll try again later, though.

        Note also that we 'age' the HINT FAIL info collected during FIND statistics
        gathering, so that things will be attempted again after a while.
        */
        if (flags & (HAM_HINT_APPEND | HAM_HINT_PREPEND))
        {
            /* find specific: APPEND / PREPEND --> SEQUENTIAL */
            flags &= ~(HAM_HINT_APPEND | HAM_HINT_PREPEND); 
            flags |= HAM_HINT_SEQUENTIAL;
        }

        if ((flags & HAM_HINTS_MASK) == 0)
        {
            /* no local preference specified; go with the DB-wide DAM config */
            switch (db_get_data_access_mode(db) & ~HAM_DAM_ENFORCE_PRE110_FORMAT)
            {
            default:
                break;

            case HAM_DAM_SEQUENTIAL_INSERT:
                flags = HAM_HINT_SEQUENTIAL;
                break;
            }
        }

        switch (flags & HAM_HINTS_MASK)
        {
        default:
        case HAM_HINT_RANDOM_ACCESS:
            /* do not provide any hints for the fast track */
            break;

        case HAM_HINT_SEQUENTIAL:
            /*
            when we have three or more hits on the same page already, we'll
            assume this one will end up there as well. As this counter resets
            itself on the first FAIL, there's no harm in acting this quickly.
            In pathological cases, the worst that can happen is that in 20% of
            cases an extra check is performed on a cached btree leaf node,
            which is still minimal overhead.
            */
            if (opstats->btree_last_page_sq_hits >= 3)
            {
                hints->leaf_page_addr = opstats->btree_last_page_addr;
                hints->try_fast_track = HAM_TRUE;
                break;
            }
            /* fall through! (the if (0) wrapper ensures the UBER_FAST case
             * below is entered only through its case label, while the plain
             * SEQUENTIAL path skips it and proceeds to the shared ratio
             * check) */
            if (0)
            {
        case HAM_HINT_SEQUENTIAL | HAM_HINT_UBER_FAST_ACCESS:
                /* same as above, but now act as fast as possible on this info */
                if (opstats->btree_last_page_sq_hits >= 1)
                {
                    hints->leaf_page_addr = opstats->btree_last_page_addr;
                    hints->try_fast_track = HAM_TRUE;
                    break;
                }
            }
            {
                /* 
                we assume this request is located near the previous request, so we check
                if there's anything in the statistics that can help out.

                Note #1: since the hinting counts are 'aged' down to a value of 0..1K (with 2K peak),
                we don't need to use a 64-bit integer for the ratio calculation here.

                Note #2: the ratio is only 'trustworthy' when the base count is about 4 or higher.
                This is because the aging rounds up while scaling down, which means one single FAIL
                can get you a ratio as large as 50% when total count is 1 as well, due to
                either startup or aging rescale; without this minimum size check, the ratio + aging
                would effectively stop the hinter from working after either an aging step or when
                a few FAILs happen during the initial few FIND operations (startup condition).

                EDIT: the above bit about the hinter stopping due to too many
                FAILs at startup or after a rescale does NOT apply any more, as
                the hinter now also includes checks which trigger when a (small)
                series of hits on the same page is seen, which acts as a
                restarter for this as well.
                */
                ham_u32_t ratio = opstats->btree_hinting_fail_count;

                ratio = ratio * 1000 / (1 + opstats->btree_hinting_count);
                if (ratio < 200)
                {
                    hints->leaf_page_addr = opstats->btree_last_page_addr;
                    hints->try_fast_track = HAM_TRUE;
                }
            }
            break;
        }
    }

    /* 
    age the hinting statistics
    
    This is different from the need to rescale the statistics data, as the latter is due to
    the risk of integer overflow when accounting for a zillion operations.
    
    Instead, the hinting costs are 'aged' to reduce the influence of older hinting 
    results on subsequent hinter output.

    The way this aging happens here results in hinting_count traveling
    asymptotically towards 1K, with an upper bound of 2K on the count, while
    fail_count will stay equal to or lower than these numbers.

    And, yes, this also means the hinting counters will NOT be rescaled by the DB rescaler;
    hinting counts act independently.
    */
    opstats->aging_tracker++;
    if (opstats->aging_tracker >= 1000)
    {
        rescale_2(opstats->btree_hinting_fail_count);
        rescale_2(opstats->btree_hinting_count);

        opstats->aging_tracker = 0;
    }
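
    /*
     * Worked illustration (not part of the original source), assuming
     * rescale_2() halves its argument and rounds up (per the Note #2
     * comment earlier): with one hinted operation per find, the count
     * peaks at 1000, 1500, 1750, 1875, 1938, ... just before each
     * rescale and settles at 500, 750, 875, 938, 969, ... right after,
     * approaching the 2K peak / 1K settled figures quoted above.
     */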

    /*
    and lastly check whether the key is out of range: when the adequate LE/GE search flags
    are not set in such a case, we can quickly decide right here that a match won't be 
    forthcoming: KEY_NOT_FOUND will be your thanks.

    One might want to add this extra 2 key comparison overhead only for
    'large' databases, i.e. databases which consist of more than 1 btree page 
    (--> lower bound page address != upper bound page address) in order to keep this overhead
    to the bare minimum under all circumstances.

    THOUGHT: However, even with a tiny, single-btree-page database, the in-page
    binary search still takes log2(N) key comparisons to find out we've hit an
    out-of-bounds key, where N is the number of keys currently stored in the
    btree page, so we MAY already benefit from this when a large number of keys
    is stored in this single-page database...

    Say we allow a 5% overhead: 2 key comparisons ~ 5% implies log2(N) = 40,
    i.e. a minimum key count per page of 2^40 keys. We'll never store that many
    in a single page, as a page is limited to 2^16 keys.

    Conclusion: only do this out-of-bounds check for multipage databases.

    And when the previous section of the hinter already produced some hints about where we might
    expect to hit (btree leaf page), we'll take that hint into account, assuming it's correct.
    And if it is not, there's nothing bad happening, except that the bounds check has
    been skipped so btree_find() will take the long (classic) route towards finding out that
    a lower or upper bound was hit.
    */
    ham_assert(!(key->_flags & KEY_IS_EXTENDED), (0));
    key->_flags &= ~KEY_IS_EXTENDED;

    if (!db_is_mgt_mode_set(flags, HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH)
        && dbdata->lower_bound_page_address != dbdata->upper_bound_page_address
        && (hints->try_fast_track 
        ? (dbdata->lower_bound_page_address == hints->leaf_page_addr
            || dbdata->upper_bound_page_address == hints->leaf_page_addr)
            : HAM_TRUE))
    {
        if (dbdata->lower_bound_set
            && !db_is_mgt_mode_set(flags, HAM_FIND_GT_MATCH))
        {
            if (dbdata->lower_bound_index == 1)
            {
                /*
                impossible index: this is a marker to signal the table 
                is completely empty
                */
                hints->key_is_out_of_bounds = HAM_TRUE;
                hints->try_fast_track = HAM_TRUE;
            }
            else
            {
                int cmp;
            
                ham_assert(dbdata->lower_bound_index == 0, (0));
                ham_assert(dbdata->lower_bound.data == NULL ?
                    dbdata->lower_bound.size == 0 : 
                    dbdata->lower_bound.size > 0, (0));
                ham_assert(dbdata->lower_bound_page_address != 0, (0));
                cmp = db_compare_keys(db, key, &dbdata->lower_bound);

                if (cmp < 0)
                {
                    hints->key_is_out_of_bounds = HAM_TRUE;
                    hints->try_fast_track = HAM_TRUE;
                }
            }
        }

        if (dbdata->upper_bound_set
            && !db_is_mgt_mode_set(flags, HAM_FIND_LT_MATCH))
        {
            int cmp;
            
            ham_assert(dbdata->upper_bound_index >= 0, (0));
            ham_assert(dbdata->upper_bound.data == NULL ?
                dbdata->upper_bound.size == 0 : 
                dbdata->upper_bound.size > 0, (0));
            ham_assert(dbdata->upper_bound_page_address != 0, (0));
            cmp = db_compare_keys(db, key, &dbdata->upper_bound);

            if (cmp > 0)
            {
                hints->key_is_out_of_bounds = HAM_TRUE;
                hints->try_fast_track = HAM_TRUE;
            }
        }
    }
}
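
/*
 * Standalone sketch (not part of the original source) of the
 * out-of-bounds short-circuit implemented above, reduced to an
 * int-keyed table: when the cached bounds are valid, the database spans
 * more than one page, and no LT/GT-style match flags are given, a
 * find() for a key outside [lower, upper] can be rejected without
 * descending the btree at all.
 */
static int  /* returns 1 when the key cannot possibly match */
key_is_out_of_bounds(int key, int lower, int upper,
                     int bounds_valid, int lt_gt_flags)
{
    if (!bounds_valid || lt_gt_flags)
        return 0;           /* approximate matches may still succeed */
    return key < lower || key > upper;
}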
static ham_status_t
__insert_split(ham_page_t *page, ham_key_t *key, 
        ham_offset_t rid, insert_scratchpad_t *scratchpad, 
        insert_hints_t *hints)
{
    int cmp;
    ham_status_t st;
    ham_page_t *newpage, *oldsib;
    int_key_t *nbte, *obte;
    btree_node_t *nbtp, *obtp, *sbtp;
    ham_size_t count, keysize;
    ham_db_t *db=page_get_owner(page);
    ham_env_t *env = db_get_env(db);
    ham_key_t pivotkey, oldkey;
    ham_offset_t pivotrid;
    ham_u16_t pivot;
    ham_bool_t pivot_at_end=HAM_FALSE;

    ham_assert(page_get_owner(page), (0));
    ham_assert(device_get_env(page_get_device(page)) 
            == db_get_env(page_get_owner(page)), (0));

    ham_assert(hints->force_append == HAM_FALSE, (0));

    keysize=db_get_keysize(db);

    /*
     * allocate a new page
     */
    hints->cost++;
    st=db_alloc_page(&newpage, db, PAGE_TYPE_B_INDEX, 0); 
    ham_assert(st ? newpage == NULL : 1, (0));
    ham_assert(!st ? newpage != NULL : 1, (0));
    if (st)
        return st; 
    ham_assert(page_get_owner(newpage), (""));
    /* clear the node header */
    memset(page_get_payload(newpage), 0, sizeof(btree_node_t));

    stats_page_is_nuked(db, page, HAM_TRUE);

    /*
     * move half of the key/rid-tuples to the new page
     *
     * !! recno: keys are sorted; we do a "lazy split"
     */
    nbtp=ham_page_get_btree_node(newpage);
    nbte=btree_node_get_key(db, nbtp, 0);
    obtp=ham_page_get_btree_node(page);
    obte=btree_node_get_key(db, obtp, 0);
    count=btree_node_get_count(obtp);

    /*
     * for databases with sequential access (this includes recno databases):
     * do not split in the middle, but at the very end of the page
     *
     * if this page is the right-most page in the index, and this key is 
     * inserted at the very end, then we select the same pivot as for
     * sequential access
     */
    if (db_get_data_access_mode(db)&HAM_DAM_SEQUENTIAL_INSERT)
        pivot_at_end=HAM_TRUE;
    else if (btree_node_get_right(obtp)==0) {
        cmp=key_compare_pub_to_int(db, page, key, btree_node_get_count(obtp)-1);
        if (cmp>0)
            pivot_at_end=HAM_TRUE;
    }

    /*
     * internal pages set the count of the new page to count-pivot-1 (because
     * the pivot element will become ptr_left of the new page).
     * by using pivot=count-2 we make sure that at least 1 element will remain
     * in the new node.
     */
    if (pivot_at_end) {
        pivot=count-2;
    }
    else {
        pivot=count/2;
    }
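    /* worked example: count = 10. A middle split gives pivot = 5, so a
     * leaf keeps 5 keys and moves 5 to the new page; a sequential-mode
     * split gives pivot = 8, keeping the old page nearly full and moving
     * only 2 keys, which is ideal when keys keep arriving at the end. */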

    /*
     * uncouple all cursors
     */
    st=bt_uncouple_all_cursors(page, pivot);
    if (st)
        return (st);

    /*
     * if we split a leaf, we'll insert the pivot element in the leaf
     * page, too. in internal nodes, we don't insert it, but propagate
     * it to the parent node only.
     */
    if (btree_node_is_leaf(obtp)) {
        hints->cost += stats_memmove_cost((db_get_int_key_header_size()+keysize)*(count-pivot));
        memcpy((char *)nbte,
               ((char *)obte)+(db_get_int_key_header_size()+keysize)*pivot, 
               (db_get_int_key_header_size()+keysize)*(count-pivot));
    }
    else {
        hints->cost += stats_memmove_cost((db_get_int_key_header_size()+keysize)*(count-pivot-1));
        memcpy((char *)nbte,
               ((char *)obte)+(db_get_int_key_header_size()+keysize)*(pivot+1), 
               (db_get_int_key_header_size()+keysize)*(count-pivot-1));
    }
    
    /* 
     * store the pivot element, we'll need it later to propagate it 
     * to the parent page
     * (note: despite its name, nbte now points at the pivot key in
     * the OLD page)
     */
    nbte=btree_node_get_key(db, obtp, pivot);

    memset(&pivotkey, 0, sizeof(pivotkey));
    memset(&oldkey, 0, sizeof(oldkey));
    oldkey.data=key_get_key(nbte);
    oldkey.size=key_get_size(nbte);
    oldkey._flags=key_get_flags(nbte);
    st = util_copy_key(db, &oldkey, &pivotkey);
    if (st) 
    {
        (void)db_free_page(newpage, DB_MOVE_TO_FREELIST);
        goto fail_dramatically;
    }
    pivotrid=page_get_self(newpage);

    /*
     * adjust the page count
     */
    if (btree_node_is_leaf(obtp)) {
        btree_node_set_count(obtp, pivot);
        btree_node_set_count(nbtp, count-pivot);
    }
    else {
        btree_node_set_count(obtp, pivot);
        btree_node_set_count(nbtp, count-pivot-1);
    }

    /*
     * if we're in an internal page: fix the ptr_left of the new page
     * (it points to the ptr of the pivot key)
     */ 
    if (!btree_node_is_leaf(obtp)) {
        /* 
         * nbte still contains the pivot key 
         */
        btree_node_set_ptr_left(nbtp, key_get_ptr(nbte));
    }

    /*
     * insert the new element
     */
    hints->cost++;
    cmp=key_compare_pub_to_int(db, page, key, pivot);
    if (cmp < -1)   /* values below -1 are ham_status_t error codes */
    {
        st = (ham_status_t)cmp;
        goto fail_dramatically;
    }

    if (cmp>=0)
        st=__insert_nosplit(newpage, key, rid, 
                scratchpad->record, scratchpad->cursor, hints);
    else
        st=__insert_nosplit(page, key, rid, 
                scratchpad->record, scratchpad->cursor, hints);
    if (st) 
    {
        goto fail_dramatically;
    }
    scratchpad->cursor=0; /* don't overwrite cursor if __insert_nosplit
                             is called again */

    /*
     * fix the double-linked list of pages, and mark the pages as dirty
     */
    if (btree_node_get_right(obtp)) 
    {
        st=db_fetch_page(&oldsib, db, btree_node_get_right(obtp), 0);
        if (st)
            goto fail_dramatically;
    }
    else
    {
        oldsib=0;
    }

    if (oldsib) {
        st=ham_log_add_page_before(oldsib);
        if (st)
            goto fail_dramatically;
    }

    btree_node_set_left (nbtp, page_get_self(page));
    btree_node_set_right(nbtp, btree_node_get_right(obtp));
    btree_node_set_right(obtp, page_get_self(newpage));
    if (oldsib) {
        sbtp=ham_page_get_btree_node(oldsib);
        btree_node_set_left(sbtp, page_get_self(newpage));
        page_set_dirty(oldsib, env);
    }
    page_set_dirty(newpage, env);
    page_set_dirty(page, env);

    /* 
     * propagate the pivot key to the parent page
     */
    ham_assert(!(scratchpad->key.flags & HAM_KEY_USER_ALLOC), (0));
    if (scratchpad->key.data)
        allocator_free(env_get_allocator(env), scratchpad->key.data);
    scratchpad->key=pivotkey;
    scratchpad->rid=pivotrid;
    ham_assert(!(scratchpad->key.flags & HAM_KEY_USER_ALLOC), (0));

    return (SPLIT);

fail_dramatically:
    ham_assert(!(pivotkey.flags & HAM_KEY_USER_ALLOC), (0));
    if (pivotkey.data)
        allocator_free(env_get_allocator(env), pivotkey.data);
    return st;
}
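
/*
 * Standalone sketch (not part of the original source) of the sibling
 * relink performed near the end of __insert_split() above, on a minimal
 * doubly-linked list of leaves (the real code stores page addresses
 * rather than pointers): the new page is spliced in immediately to the
 * right of the page that was split.
 */
struct leaf { struct leaf *left, *right; };

static void
splice_right(struct leaf *page, struct leaf *newpage)
{
    newpage->left  = page;
    newpage->right = page->right;
    if (page->right)
        page->right->left = newpage;    /* old sibling points back */
    page->right = newpage;
}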