Example #1
static ham_status_t
__get_duplicate_table(dupe_table_t **table_ref, ham_page_t **page, ham_env_t *env, ham_u64_t table_id)
{
    ham_status_t st;
    blob_t hdr;
    ham_page_t *hdrpage=0;
    dupe_table_t *table;

    *page = 0;
    if (env_get_rt_flags(env)&HAM_IN_MEMORY_DB) {
        ham_u8_t *p=(ham_u8_t *)U64_TO_PTR(table_id);
        *table_ref = (dupe_table_t *)(p+sizeof(hdr));
        return HAM_SUCCESS;
    }

    *table_ref = 0;

    /*
     * load the blob header
     */
    st=__read_chunk(env, 0, &hdrpage, table_id, (ham_u8_t *)&hdr, sizeof(hdr));
    if (st) {
        return st;
    }

    /*
     * if the whole table fits into one page (and is not split across several
     * pages), just return a pointer directly into the page
     */
    if (page_get_self(hdrpage)+env_get_usable_pagesize(env) >=
            table_id+blob_get_size(&hdr)) 
    {
        ham_u8_t *p=page_get_raw_payload(hdrpage);
        /* yes, table is in the page */
        *page=hdrpage;
        *table_ref = (dupe_table_t *)
                &p[table_id-page_get_self(hdrpage)+sizeof(hdr)];
        return HAM_SUCCESS;
    }

    /*
     * otherwise allocate memory for the table
     */
    table=allocator_alloc(env_get_allocator(env), (ham_size_t)blob_get_size(&hdr));
    if (!table) {
        return HAM_OUT_OF_MEMORY;
    }

    /*
     * then read the rest of the blob
     */
    st=__read_chunk(env, hdrpage, 0, table_id+sizeof(hdr), 
            (ham_u8_t *)table, (ham_size_t)blob_get_size(&hdr));
    if (st) {
        return st;
    }

    *table_ref = table;
    return HAM_SUCCESS;
}
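
The in-memory branch above works because an in-memory "blob id" is nothing but the address of the allocated buffer, packed into a 64-bit integer. Below is a minimal sketch of that round-trip, assuming the PTR_TO_U64/U64_TO_PTR conversion macros seen elsewhere in these examples; the helper name alloc_in_memory_blob() is hypothetical and not part of the original code.

static ham_offset_t
alloc_in_memory_blob(ham_env_t *env, ham_size_t payload_size)
{
    /* sketch only: the returned "id" is simply the buffer address, so a
     * later lookup (as in __get_duplicate_table) needs no I/O at all */
    ham_u8_t *p=(ham_u8_t *)allocator_alloc(env_get_allocator(env), 
                    sizeof(blob_t)+payload_size);
    if (!p)
        return 0;
    return (ham_offset_t)PTR_TO_U64(p);
}

__get_duplicate_table() then only has to convert the id back with U64_TO_PTR and skip the blob_t header to reach the duplicate table.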
ham_status_t 
cache_remove_page(ham_cache_t *cache, ham_page_t *page)
{
    ham_bool_t removed = HAM_FALSE;

    if (page_get_self(page)) 
    {
        ham_size_t hash=__calc_hash(cache, page_get_self(page));
        if (page_is_in_list(cache_get_bucket(cache, hash), page, 
                PAGE_LIST_BUCKET)) {
            cache_set_bucket(cache, hash, 
                    page_list_remove(cache_get_bucket(cache, hash), 
                    PAGE_LIST_BUCKET, page));
        }
    }

    if (page_is_in_list(cache_get_totallist(cache), page, PAGE_LIST_CACHED)) {
        cache_set_totallist(cache, page_list_remove(cache_get_totallist(cache), 
                PAGE_LIST_CACHED, page));
        removed = HAM_TRUE;
    }
    if (page_is_in_list(cache_get_garbagelist(cache), page, PAGE_LIST_GARBAGE)){
        cache_set_garbagelist(cache, 
                    page_list_remove(cache_get_garbagelist(cache), 
                    PAGE_LIST_GARBAGE, page));
        removed = HAM_TRUE;
    }
    if (removed) {
        cache_set_cur_elements(cache, 
                cache_get_cur_elements(cache)-1);
    }

    return (0);
}
ham_status_t
txn_add_page(ham_txn_t *txn, ham_page_t *page, ham_bool_t ignore_if_inserted)
{
    /*
     * don't re-insert if 'ignore_if_inserted' is true
     */
    if (ignore_if_inserted && txn_get_page(txn, page_get_self(page)))
        return (0);

#ifdef HAM_DEBUG
    /*
     * check if the page is already in the transaction's pagelist - 
     * that would be a bug
     */
    ham_assert(txn_get_page(txn, page_get_self(page))==0, 
            ("page 0x%llx is already in the txn", page_get_self(page)));
#endif

    /*
     * not found? add the page
     */
    page_add_ref(page);

    ham_assert(!page_is_in_list(txn_get_pagelist(txn), page, PAGE_LIST_TXN), (0));
    txn_set_pagelist(txn, page_list_insert(txn_get_pagelist(txn), 
            PAGE_LIST_TXN, page));

    return (HAM_SUCCESS);
}
/*
 * when the last hit leaf node is split or shrunk, blow it away for all operations!
 *
 * Also blow away a page when a transaction that has modified this page aborts. We'd rather
 * reconstruct our critical statistics than carry the wrong bounds, etc. around.
 *
 * This is done to prevent the hinter from hinting/pointing at a (by now)
 * INVALID btree node later on!
 */
void 
stats_page_is_nuked(ham_db_t *db, struct ham_page_t *page, ham_bool_t split)
{
    ham_runtime_statistics_dbdata_t *dbdata = db_get_db_perf_data(db);
    ham_env_t *env = db_get_env(db);
    int i;

    for (i = 0; i <= 2; i++)
    {
        ham_runtime_statistics_opdbdata_t *opstats = db_get_op_perf_data(db, i);

        ham_assert(i == HAM_OPERATION_STATS_FIND
                    || i == HAM_OPERATION_STATS_INSERT
                    || i == HAM_OPERATION_STATS_ERASE, (0));

        if (opstats->btree_last_page_addr == page_get_self(page))
        {
            opstats->btree_last_page_addr = 0;
            opstats->btree_last_page_sq_hits = 0;
        }
    }

    if (dbdata->lower_bound_page_address == page_get_self(page))
    {
        if (dbdata->lower_bound.data)
        {
            ham_assert(env_get_allocator(env) != 0, (0));
            allocator_free(env_get_allocator(env), dbdata->lower_bound.data);
        }
        memset(&dbdata->lower_bound, 0, sizeof(dbdata->lower_bound));
        dbdata->lower_bound_index = 0;
        dbdata->lower_bound_page_address = 0;
        dbdata->lower_bound_set = HAM_FALSE;
    }

    if (dbdata->upper_bound_page_address == page_get_self(page))
    {
        if (dbdata->upper_bound.data)
        {
            ham_assert(env_get_allocator(env) != 0, (0));
            allocator_free(env_get_allocator(env), dbdata->upper_bound.data);
        }
        memset(&dbdata->upper_bound, 0, sizeof(dbdata->upper_bound));
        dbdata->upper_bound_index = 0;
        dbdata->upper_bound_page_address = 0;
        dbdata->upper_bound_set = HAM_FALSE;
    }
}
void 
stats_update(int op, ham_db_t *db, ham_page_t *page, ham_size_t cost, ham_bool_t try_fast_track)
{
    ham_runtime_statistics_dbdata_t *dbstats = db_get_db_perf_data(db);
    ham_runtime_statistics_opdbdata_t *opstats = db_get_op_perf_data(db, op);

    ham_assert(op == HAM_OPERATION_STATS_FIND
                || op == HAM_OPERATION_STATS_INSERT
                || op == HAM_OPERATION_STATS_ERASE, (0));
    ham_assert(page, (0));

    /*
     * Again, cost is the fastest riser, so we check that one against a high water mark
     * to decide whether to rescale or not
     */
    if (dbstats->rescale_tracker >= HAM_STATISTICS_HIGH_WATER_MARK - cost)
    {
        rescale_db_stats(dbstats);
    }
    dbstats->rescale_tracker += cost;

    opstats->btree_count++;
    //opstats->btree_fail_count++;
    opstats->btree_cost += cost;
    //opstats->btree_fail_cost += cost;

    /*
     * when we got a hint, account for its success/failure
     */
    if (try_fast_track)
    {
        if (opstats->btree_last_page_addr != page_get_self(page))
        {
            opstats->btree_hinting_fail_count++;
        }
        opstats->btree_hinting_count++;
    }
    
    if (opstats->btree_last_page_addr
        && opstats->btree_last_page_addr == page_get_self(page))
    {
        opstats->btree_last_page_sq_hits++;
    }
    else
    {
        opstats->btree_last_page_addr = page_get_self(page);
    }
}
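
The high-water-mark check at the top of stats_update() keeps the cost accumulator from overflowing: once rescale_tracker would cross HAM_STATISTICS_HIGH_WATER_MARK, all counters are scaled down by the same factor so the ratios between them survive. Here is a minimal standalone sketch of that saturating-accumulator pattern; the names and the shift factor are assumptions, not hamsterdb's actual rescale_db_stats() implementation.

#define EXAMPLE_HIGH_WATER_MARK (1024u * 1024u)

static void
example_track_cost(unsigned *tracker, unsigned *total_cost, unsigned cost)
{
    /* rescale before the accumulator could cross the high water mark */
    if (*tracker >= EXAMPLE_HIGH_WATER_MARK - cost) {
        *tracker >>= 8;       /* scale everything down by the same factor */
        *total_cost >>= 8;    /* so the relative ratios stay meaningful */
    }
    *tracker += cost;
    *total_cost += cost;
}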
ham_page_t *
cache_get_page(ham_cache_t *cache, ham_offset_t address, ham_u32_t flags)
{
    ham_page_t *page;
    ham_size_t hash=__calc_hash(cache, address);

    page=cache_get_bucket(cache, hash);
    while (page) {
        if (page_get_self(page)==address)
            break;
        page=page_get_next(page, PAGE_LIST_BUCKET);
    }

    if (page && flags != CACHE_NOREMOVE) {
        if (page_is_in_list(cache_get_totallist(cache), page, PAGE_LIST_CACHED))
        {
            cache_set_totallist(cache, 
                page_list_remove(cache_get_totallist(cache), 
                PAGE_LIST_CACHED, page));
        }
        ham_assert(page_is_in_list(cache_get_bucket(cache, hash), page, 
                PAGE_LIST_BUCKET), (0));
        cache_set_bucket(cache, hash,
            page_list_remove(cache_get_bucket(cache, 
            hash), PAGE_LIST_BUCKET, page));

        cache_set_cur_elements(cache, 
            cache_get_cur_elements(cache)-1);
    }

    return (page);
}
ham_status_t 
cache_put_page(ham_cache_t *cache, ham_page_t *page)
{
    ham_size_t hash=__calc_hash(cache, page_get_self(page));
    ham_bool_t new_page = HAM_TRUE;

    ham_assert(page_get_pers(page), (""));

    if (page_is_in_list(cache_get_totallist(cache), page, PAGE_LIST_CACHED)) {
        cache_set_totallist(cache, 
                page_list_remove(cache_get_totallist(cache), 
                PAGE_LIST_CACHED, page));

        new_page = HAM_FALSE;

        cache_set_cur_elements(cache, 
                cache_get_cur_elements(cache)-1);
    }
    ham_assert(!page_is_in_list(cache_get_totallist(cache), page, 
                PAGE_LIST_CACHED), (0));
    cache_set_totallist(cache, 
            page_list_insert(cache_get_totallist(cache), 
            PAGE_LIST_CACHED, page));

    cache_set_cur_elements(cache, 
            cache_get_cur_elements(cache)+1);

    /*
     * insert it in the cache bucket
     * !!!
     * to avoid inserting the page twice, we first remove it from the 
     * bucket
     */
    if (page_is_in_list(cache_get_bucket(cache, hash), page, PAGE_LIST_BUCKET))
    {
        cache_set_bucket(cache, hash, page_list_remove(cache_get_bucket(cache, 
                    hash), PAGE_LIST_BUCKET, page));
    }
    ham_assert(!page_is_in_list(cache_get_bucket(cache, hash), page, 
                PAGE_LIST_BUCKET), (0));
    cache_set_bucket(cache, hash, page_list_insert(cache_get_bucket(cache, 
                hash), PAGE_LIST_BUCKET, page));

    return (0);
}
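
Both cache_get_page() and cache_put_page() keep every page in two lists at once: a global 'totallist' and a hash bucket selected by __calc_hash(). The hash function itself is not part of these excerpts; the sketch below shows a typical shape purely as an assumption, dividing the page-aligned address by the page size and folding it into the bucket count.

/* assumed shape of __calc_hash() - illustrative only, not taken from the
 * excerpts above */
static ham_size_t
example_calc_hash(ham_size_t bucket_count, ham_size_t pagesize, 
        ham_offset_t address)
{
    /* page addresses are page-aligned; divide first so consecutive pages
     * land in consecutive buckets */
    return (ham_size_t)((address/pagesize)%bucket_count);
}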
ham_page_t *
txn_get_page(ham_txn_t *txn, ham_offset_t address)
{
    ham_page_t *p=txn_get_pagelist(txn);

#ifdef HAM_DEBUG
    ham_page_t *start=p;
#endif

    while (p) {
        ham_offset_t o=page_get_self(p);
        if (o==address)
            return (p);
        p=page_get_next(p, PAGE_LIST_TXN);
        ham_assert(start!=p, ("circular reference in page-list"));
    }

    return (0);
}
Example #9
static ham_status_t
__read_chunk(ham_env_t *env, ham_page_t *page, ham_page_t **fpage, 
        ham_offset_t addr, ham_u8_t *data, ham_size_t size)
{
    ham_status_t st;
    ham_device_t *device=env_get_device(env);

    while (size) {
        /*
         * get the page-ID from this chunk
         */
        ham_offset_t pageid = addr - (addr % env_get_pagesize(env));

        if (page) {
            if (page_get_self(page)!=pageid)
                page=0;
        }

        /*
         * is it the current page? if not, try to fetch the page from
         * the cache - but only read the page from disk if the 
         * chunk is small
         */
        if (!page) {
            st=env_fetch_page(&page, env, pageid, 
                    __blob_from_cache(env, size) ? 0 : DB_ONLY_FROM_CACHE);
            ham_assert(st ? !page : 1, (0));
            /* blob pages don't have a page header */
            if (page)
                page_set_npers_flags(page, 
                    page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
            else if (st)
                return st;
        }

        /*
         * if we have a page pointer: use it; otherwise read directly
         * from the device
         */
        if (page) {
            ham_size_t readstart=
                    (ham_size_t)(addr-page_get_self(page));
            ham_size_t readsize =
                    (ham_size_t)(env_get_pagesize(env)-readstart);
            if (readsize>size)
                readsize=size;
            memcpy(data, &page_get_raw_payload(page)[readstart], readsize);
            addr+=readsize;
            data+=readsize;
            size-=readsize;
        }
        else {
            ham_size_t s=(size<env_get_pagesize(env) 
                    ? size : env_get_pagesize(env));
            /* limit to the next page boundary */
            if (s>pageid+env_get_pagesize(env)-addr)
                s=(ham_size_t)(pageid+env_get_pagesize(env)-addr);

            st=device->read(device, addr, data, s);
            if (st) 
                return st;
            addr+=s;
            data+=s;
            size-=s;
        }
    }

    if (fpage)
        *fpage=page;

    return (0);
}
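
The heart of __read_chunk() is the page-boundary arithmetic: pageid = addr - (addr % pagesize) finds the page containing addr, and the copy is clamped to whatever is left of that page, so the next iteration always starts on a page boundary. Below is a worked example with an assumed page size of 16384 bytes.

#include <assert.h>

static void
example_read_chunk_math(void)
{
    unsigned long pagesize = 16384;
    unsigned long addr     = 20480;     /* 4096 bytes into the second page */
    unsigned long size     = 30000;     /* bytes still to read */

    unsigned long pageid    = addr-(addr%pagesize);     /* 16384 */
    unsigned long readstart = addr-pageid;              /*  4096 */
    unsigned long readsize  = pagesize-readstart;       /* 12288 */
    if (readsize>size)
        readsize=size;

    assert(pageid==16384 && readstart==4096 && readsize==12288);
    /* the next round continues at addr+readsize == 32768, a page boundary */
}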
ham_status_t
txn_abort(ham_txn_t *txn, ham_u32_t flags)
{
    ham_status_t st;
    ham_env_t *env=txn_get_env(txn);

    /*
     * are cursors attached to this txn? if yes, fail
     */
    if (txn_get_cursor_refcount(txn)) {
        ham_trace(("transaction cannot be aborted till all attached "
                    "cursors are closed"));
        return HAM_CURSOR_STILL_OPEN;
    }

    if (env_get_log(env) && !(txn_get_flags(txn)&HAM_TXN_READ_ONLY)) {
        st=ham_log_append_txn_abort(env_get_log(env), txn);
        if (st) 
            return st;
    }

    env_set_txn(env, 0);

    /*
     * undo all operations from this transaction
     * 
     * this includes allocated pages (they're moved to the freelist), 
     * deleted pages (they're un-deleted) and other modifications (will
     * re-create the original page from the logfile)
     *
     * keep txn_get_pagelist(txn) intact during every round, so no 
     * local var for this one.
     */
    while (txn_get_pagelist(txn)) {
        ham_page_t *head = txn_get_pagelist(txn);

        if (!(flags & DO_NOT_NUKE_PAGE_STATS)) {
            /* 
             * nuke critical statistics, such as tracked outer bounds; imagine,
             * for example, a failing erase transaction which, through erasing 
             * the top-most key, lowers the actual upper bound, after which 
             * the transaction fails at some later point in life. If we 
             * didn't 'rewind' our bounds-statistics, we would end up in a 
             * situation where a subsequent out-of-bounds insert (~ append) 
             * could FAIL because the hinter would be using incorrect bounds 
             * information!
             *
             * Hence we 'reverse' our statistics here and the easiest route 
             * is to just nuke the critical bits; subsequent find/insert/erase 
             * operations will ensure that the stats will get updated again, 
             * anyhow. All we lose then is a few subsequent operations, which 
             * might have been hinted if we had played a smarter game of 
             * statistics 'reversal'. So be it.
             */
            ham_db_t *db = page_get_owner(head);

            /*
             * only need to do this for index pages anyhow, and those are the 
             * ones which have their 'ownership' set.
             */
            if (db) {
                stats_page_is_nuked(db, head, HAM_FALSE); 
            }
        }

        ham_assert(page_is_in_list(txn_get_pagelist(txn), head, PAGE_LIST_TXN),
                             (0));
        txn_get_pagelist(txn) = page_list_remove(head, PAGE_LIST_TXN, head);

        /* if this page was allocated by this transaction, then we can
         * move the whole page to the freelist */
        if (page_get_alloc_txn_id(head)==txn_get_id(txn)) {
            (void)freel_mark_free(env, 0, page_get_self(head), 
                    env_get_pagesize(env), HAM_TRUE);
        }
        else {
            /* remove the 'delete pending' flag */
            page_set_npers_flags(head, 
                    page_get_npers_flags(head)&~PAGE_NPERS_DELETE_PENDING);

            /* if the page is dirty, and RECOVERY is enabled: recreate
             * the original, unmodified page from the log */
            if (env_get_log(env) && page_is_dirty(head)) {
                st=ham_log_recreate(env_get_log(env), head);
                if (st)
                    return (st);
                /*page_set_undirty(head); */
            }
        }

        /* page is no longer in use */
        page_release_ref(head);
    }

    ham_assert(txn_get_pagelist(txn)==0, (0));

    return (0);
}
void 
btree_insert_get_hints(insert_hints_t *hints, ham_db_t *db, ham_key_t *key)
{
    ham_runtime_statistics_dbdata_t *dbdata = db_get_db_perf_data(db);
    ham_bt_cursor_t *cursor = (ham_bt_cursor_t *)hints->cursor;

    ham_assert(hints->force_append == HAM_FALSE, (0));
    ham_assert(hints->force_prepend == HAM_FALSE, (0));
    ham_assert(hints->try_fast_track == HAM_FALSE, (0));

    if ((hints->flags & HAM_HINT_APPEND) && (cursor))
    {
        if (!bt_cursor_is_nil(cursor))
        {
            ham_assert(bt_cursor_is_nil(cursor)==0, ("cursor must not be nil"));
            ham_assert(db == bt_cursor_get_db(cursor), (0));

            /*
             fetch the page of the cursor. We deem the cost of an uncoupled 
             cursor too high, as that implies a full-fledged key search on the
             given key - which can be rather costly - so we rather wait for 
             the statistical cavalry a little further down.
             */
            if (bt_cursor_get_flags(cursor) & BT_CURSOR_FLAG_COUPLED) 
            {
                ham_page_t *page = bt_cursor_get_coupled_page(cursor);
                btree_node_t *node = ham_page_get_btree_node(page);
                ham_assert(btree_node_is_leaf(node), 
                            ("cursor points to internal node"));
                //ham_assert(!btree_node_get_right(node), ("cursor points to leaf node which is NOT the uppermost/last one"));
                /*
                 * if cursor is not coupled to the LAST (right-most) leaf
                 * in the Database, it does not make sense to append
                 */
                if (btree_node_get_right(node)) {
                    hints->force_append = HAM_FALSE;
                    hints->try_fast_track = HAM_FALSE;
                }
                else {
                    hints->leaf_page_addr = page_get_self(page);
                    hints->force_append = HAM_TRUE;
                    hints->try_fast_track = HAM_TRUE;
                }
            }
        }
    }
    else if ((hints->flags & HAM_HINT_PREPEND) && (cursor))
    {
        if (!bt_cursor_is_nil(cursor))
        {
            ham_assert(bt_cursor_is_nil(cursor)==0, ("cursor must not be nil"));
            ham_assert(db == bt_cursor_get_db(cursor), (0));

            /*
             fetch the page of the cursor. We deem the cost of an uncoupled 
             cursor too high, as that implies a full-fledged key search on the
             given key - which can be rather costly - so we rather wait for 
             the statistical cavalry a little further down.
             */
            if (bt_cursor_get_flags(cursor) & BT_CURSOR_FLAG_COUPLED) 
            {
                ham_page_t *page = bt_cursor_get_coupled_page(cursor);
                btree_node_t *node = ham_page_get_btree_node(page);
                ham_assert(btree_node_is_leaf(node), 
                        ("cursor points to internal node"));
                //ham_assert(!btree_node_get_left(node), ("cursor points to leaf node which is NOT the lowest/first one"));
                /*
                 * if cursor is not coupled to the FIRST (left-most) leaf
                 * in the Database, it does not make sense to prepend
                 */
                if (btree_node_get_left(node)) {
                    hints->force_prepend = HAM_FALSE;
                    hints->try_fast_track = HAM_FALSE;
                }
                else {
                    hints->leaf_page_addr = page_get_self(page);
                    hints->force_prepend = HAM_TRUE;
                    hints->try_fast_track = HAM_TRUE;
                }
            }
        }
    }
    //hints->flags &= ~(HAM_HINT_APPEND | HAM_HINT_PREPEND);

    /* 
    The statistical cavalry:

    - when the given key is positioned beyond the end, hint 'append' anyway.

    - When the given key is positioned before the start, hint 'prepend' anyway.
    
    NOTE: This 'auto-detect' mechanism (thanks to the key bounds being collected through
    the statistics gathering calls) renders the manual option HAM_HINT_APPEND/_PREPEND
    somewhat obsolete, really. 

    The only advantage of manually specifying HAM_HINT_APPEND/_PREPEND is that it can save you
    two key comparisons in here.
    */
    ham_assert(!(key->_flags & KEY_IS_EXTENDED), (0));
    key->_flags &= ~KEY_IS_EXTENDED;

    if (!hints->try_fast_track)
    {
        ham_runtime_statistics_opdbdata_t *opstats = db_get_op_perf_data(db, HAM_OPERATION_STATS_INSERT);

        ham_assert(opstats != NULL, (0));

        if (hints->flags & (HAM_HINT_APPEND | HAM_HINT_PREPEND))
        {
            /* find specific: APPEND / PREPEND --> SEQUENTIAL */
            hints->flags &= ~(HAM_HINT_APPEND | HAM_HINT_PREPEND); 
            hints->flags |= HAM_HINT_SEQUENTIAL;
        }

        if ((hints->flags & HAM_HINTS_MASK) == 0)
        {
            /* no local preference specified; go with the DB-wide DAM config */
            switch (db_get_data_access_mode(db) & ~HAM_DAM_ENFORCE_PRE110_FORMAT)
            {
            default:
                break;

            case HAM_DAM_SEQUENTIAL_INSERT:
                hints->flags |= HAM_HINT_SEQUENTIAL;
                break;
            }
        }

        switch (hints->flags & HAM_HINTS_MASK)
        {
        default:
        case HAM_HINT_RANDOM_ACCESS:
            /* do not provide any hints for the fast track */
            break;

        case HAM_HINT_SEQUENTIAL:
            /*
            when we have more than 4 hits on the same page already, we'll assume this one 
            will end up there as well. As this counter will reset itself on the first FAIL,
            there's no harm in acting this quickly. In pathological cases, the worst that
            can happen is that roughly 20% of the operations perform an extra check on
            a cached btree leaf node, which is still minimal overhead.
            */
            if (opstats->btree_last_page_sq_hits >= 3)
            {
                hints->leaf_page_addr = opstats->btree_last_page_addr;
                hints->try_fast_track = HAM_TRUE;
                break;
            }
            /* fall through! */
            if (0)
            {
        case HAM_HINT_SEQUENTIAL | HAM_HINT_UBER_FAST_ACCESS:
            /* same as above, but now act as fast as possible on this info */
            if (opstats->btree_last_page_sq_hits >= 1)
            {
                hints->leaf_page_addr = opstats->btree_last_page_addr;
                hints->try_fast_track = HAM_TRUE;
                break;
            }
            }
            {
                /* 
                we assume this request is located near the previous request, so we check
                if there's anything in the statistics that can help out.

                Note #1: since the hinting counts are 'aged' down to a value of 0..1K (with 2K peak),
                we don't need to use a 64-bit integer for the ratio calculation here.

                Note #2: the ratio is only 'trustworthy' when the base count is about 4 or higher.
                This is because the aging rounds up while scaling down, which means one single FAIL
                can get you a ratio as large as 50% when total count is 1 as well, due to
                either startup or aging rescale; without this minimum size check, the ratio + aging
                would effectively stop the hinter from working after either an aging step or when
                a few FAILs happen during the initial few FIND operations (startup condition).

                EDIT: the above bit about the hinter stopping due to too much FAIL at start or after
                rescale does NOT apply any more as the hinter now also includes checks which trigger
                when a (small) series of hits on the same page are found, which acts as a restarter
                for this as well.
                */
                ham_u32_t ratio = opstats->btree_hinting_fail_count;

                ratio = ratio * 1000 / (1 + opstats->btree_hinting_count);
                if (ratio < 200)
                {
                    hints->leaf_page_addr = opstats->btree_last_page_addr;
                    hints->try_fast_track = HAM_TRUE;
                    hints->force_append = HAM_TRUE;
                }
            }
            
            if (dbdata->lower_bound_set)
            {
                if (dbdata->lower_bound_index == 1)
                {
                    /*
                    impossible index: this is a marker to signal the table 
                    is completely empty
                    */
                    //hints->flags |= HAM_HINT_PREPEND;
                    hints->force_prepend = HAM_TRUE;
                    hints->leaf_page_addr = dbdata->lower_bound_page_address;
                    hints->try_fast_track = HAM_TRUE;
                }
                else
                {
                    int cmp;
                    
                    ham_assert(dbdata->lower_bound_index == 0, (0));
                    ham_assert(dbdata->lower_bound.data == NULL ?
                        dbdata->lower_bound.size == 0 : 
                        dbdata->lower_bound.size > 0, (0));
                    ham_assert(dbdata->lower_bound_page_address != 0, (0));
                    cmp = db_compare_keys(db, key, &dbdata->lower_bound);

                    if (cmp < 0)
                    {
                        //hints->flags |= HAM_HINT_PREPEND;
                        hints->force_prepend = HAM_TRUE;
                        hints->leaf_page_addr = dbdata->lower_bound_page_address;
                        hints->try_fast_track = HAM_TRUE;
                    }
                }
            }

            if (dbdata->upper_bound_set)
            {
                int cmp;
                
                ham_assert(dbdata->upper_bound_index >= 0, (0));
                ham_assert(dbdata->upper_bound.data == NULL ?
                    dbdata->upper_bound.size == 0 : 
                    dbdata->upper_bound.size > 0, (0));
                ham_assert(dbdata->upper_bound_page_address != 0, (0));
                cmp = db_compare_keys(db, key, &dbdata->upper_bound);

                if (cmp > 0)
                {
                    //hints->flags |= HAM_HINT_APPEND;
                    hints->force_append = HAM_TRUE;
                    hints->leaf_page_addr = dbdata->upper_bound_page_address;
                    hints->try_fast_track = HAM_TRUE;
                }
            }
            break;
        }
    }

    /* 
    we don't (yet) hint about jumping to the last accessed leaf node immediately

    EDIT:

    now we do: see the flags + DAM code above; this happens when neither PREPEND 
    nor APPEND hints are specified 
    */
}
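
The fail-ratio test in the SEQUENTIAL branch above boils down to ratio = btree_hinting_fail_count * 1000 / (1 + btree_hinting_count), and the fast track is only used while that per-mille ratio stays below 200, i.e. while fewer than roughly 20% of the handed-out hints pointed at the wrong page. A worked check of that arithmetic, with made-up counter values:

#include <assert.h>

static void
example_hinting_ratio(void)
{
    unsigned fail_count    = 3;     /* hints that pointed at the wrong page */
    unsigned hinting_count = 20;    /* hints handed out in total */

    unsigned ratio = fail_count*1000/(1+hinting_count);
    assert(ratio == 142);           /* about 14.3% failures */
    assert(ratio < 200);            /* below the ~20% cut-off: keep hinting */
}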
ham_page_t * 
cache_get_unused_page(ham_cache_t *cache)
{
    ham_page_t *page;
    ham_page_t *head;
    ham_page_t *min = 0;
    ham_size_t hash;

    page=cache_get_garbagelist(cache);
    if (page) {
        ham_assert(page_get_refcount(page)==0, 
                ("page is in use and in garbage list"));
        cache_set_garbagelist(cache, 
                page_list_remove(cache_get_garbagelist(cache), 
                    PAGE_LIST_GARBAGE, page));

        cache_set_cur_elements(cache, 
                cache_get_cur_elements(cache)-1);
        return (page);
    }

    head=cache_get_totallist(cache);
    if (!head)
        return (0);

    /*
     * Oh, this was all so unfair! <grin>
     *
     * As pages are added at the HEAD and NEXT for page P points to the
     * next older item (i.e. the previously added item to the linked
     * list), it means ->NEXT means
     * 'older'.
     *
     * While, in finding the oldest, re-usable page, we should /start/
     * with the oldest and gradually progress towards the 'younger',
     * i.e. traverse the link list in reverse, by traversing the ->PREV chain 
     * instead of the usual ->NEXT chain!
     *
     * And our 'proper' starting point then would be the
     * /oldest/ fella in the chain, and that would've been HEAD->PREV
     * if we'd had cyclic double linked lists here; alas, we have NOT,
     * so we just travel down the ->NEXT path and pick the oldest geezer we can
     * find; after all that's one traversal with the same result as first
     * traveling all the way down to the endstop, and then reversing
     * through ->PREV... If only our cyclic LL patch hadn't caused such
     * weird bugs  :-(
     */
    page = head;
    do {
        /* only handle unused pages */
        if (page_get_refcount(page)==0) {
            if (page_get_cache_cntr(page)==0) {
                min=page;
                //goto found_page;
            }
            else {
                if (!min)
                    min=page;
                else if (page_get_cache_cntr(page) <= page_get_cache_cntr(min)) 
                {
                    /* oldest! */
                    min=page;
                }
            }
#if 0
            /*
             * This is not an equal opportunity scheme!
             *
             * Pages at the front of the list will be decremented
             * continuously (once every round) and therefore have a far
             * larger chance of getting 'unused'/re-used than pages at
             * the end of the chain, as those almost never will get
             * their counters decremented.
             *
             * Instead, we have an alternative mechanism, where we
             * always count UP, UNTIL... we hit a global high water mark
             * --> decrement ALL pages by the same amount, so that we
             * have some headroom again.
             */
            page_decrement_cache_cntr(page, 1);
#endif
        }
        page=page_get_next(page, PAGE_LIST_CACHED);
        ham_assert(page != head, (0));
    } while (page && page!=head);
    
    if (!min)
        return (0);

    hash=__calc_hash(cache, page_get_self(min));

    ham_assert(page_is_in_list(cache_get_totallist(cache), min, 
                    PAGE_LIST_CACHED), (0));
    cache_set_totallist(cache, 
            page_list_remove(cache_get_totallist(cache), 
            PAGE_LIST_CACHED, min));
    ham_assert(page_is_in_list(cache_get_bucket(cache, hash), min, 
                    PAGE_LIST_BUCKET), (0));
    cache_set_bucket(cache, hash, 
            page_list_remove(cache_get_bucket(cache, 
            hash), PAGE_LIST_BUCKET, min));

    cache_set_cur_elements(cache, 
            cache_get_cur_elements(cache)-1);

    return (min);
}
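
The #if 0 block above describes an alternative aging scheme in its comment: count page accesses upwards and, once a global high-water mark is reached, scale every counter down by the same amount so that pages near the head of the list are not favoured. Below is a minimal sketch of that idea; the names, the mark and the divisor are assumptions, not hamsterdb code.

#define EXAMPLE_CACHE_HIGH_WATER_MARK 1000u

static void
example_touch_page(unsigned *counters, unsigned npages, unsigned idx)
{
    if (++counters[idx] >= EXAMPLE_CACHE_HIGH_WATER_MARK) {
        unsigned i;
        /* decrement ALL pages by the same amount to regain headroom,
         * instead of only aging the pages near the head of the list */
        for (i = 0; i < npages; i++)
            counters[i] /= 2;
    }
}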
Example #13
static ham_status_t
__insert_split(ham_page_t *page, ham_key_t *key, 
        ham_offset_t rid, insert_scratchpad_t *scratchpad, 
        insert_hints_t *hints)
{
    int cmp;
    ham_status_t st;
    ham_page_t *newpage, *oldsib;
    int_key_t *nbte, *obte;
    btree_node_t *nbtp, *obtp, *sbtp;
    ham_size_t count, keysize;
    ham_db_t *db=page_get_owner(page);
    ham_env_t *env = db_get_env(db);
    ham_key_t pivotkey, oldkey;
    ham_offset_t pivotrid;
    ham_u16_t pivot;
    ham_bool_t pivot_at_end=HAM_FALSE;

    ham_assert(page_get_owner(page), (0));
    ham_assert(device_get_env(page_get_device(page)) 
            == db_get_env(page_get_owner(page)), (0));

    ham_assert(hints->force_append == HAM_FALSE, (0));

    keysize=db_get_keysize(db);

    /*
     * allocate a new page
     */
    hints->cost++;
    st=db_alloc_page(&newpage, db, PAGE_TYPE_B_INDEX, 0); 
    ham_assert(st ? newpage == NULL : 1, (0));
    ham_assert(!st ? newpage != NULL : 1, (0));
    if (st)
        return st; 
    ham_assert(page_get_owner(newpage), (""));
    /* clear the node header */
    memset(page_get_payload(newpage), 0, sizeof(btree_node_t));

    stats_page_is_nuked(db, page, HAM_TRUE);

    /*
     * move half of the key/rid-tuples to the new page
     *
     * !! recno: keys are sorted; we do a "lazy split"
     */
    nbtp=ham_page_get_btree_node(newpage);
    nbte=btree_node_get_key(db, nbtp, 0);
    obtp=ham_page_get_btree_node(page);
    obte=btree_node_get_key(db, obtp, 0);
    count=btree_node_get_count(obtp);

    /*
     * for databases with sequential access (this includes recno databases):
     * do not split in the middle, but at the very end of the page
     *
     * if this page is the right-most page in the index, and this key is 
     * inserted at the very end, then we select the same pivot as for
     * sequential access
     */
    if (db_get_data_access_mode(db)&HAM_DAM_SEQUENTIAL_INSERT)
        pivot_at_end=HAM_TRUE;
    else if (btree_node_get_right(obtp)==0) {
        cmp=key_compare_pub_to_int(db, page, key, btree_node_get_count(obtp)-1);
        if (cmp>0)
            pivot_at_end=HAM_TRUE;
    }

    /*
     * internal pages set the count of the new page to count-pivot-1 (because
     * the pivot element will become ptr_left of the new page).
     * by using pivot=count-2 we make sure that at least 1 element will remain
     * in the new node.
     */
    if (pivot_at_end) {
        pivot=count-2;
    }
    else {
        pivot=count/2;
    }

    /*
     * uncouple all cursors
     */
    st=bt_uncouple_all_cursors(page, pivot);
    if (st)
        return (st);

    /*
     * if we split a leaf, we'll insert the pivot element in the leaf
     * page, too. in internal nodes, we don't insert it, but propagate
     * it to the parent node only.
     */
    if (btree_node_is_leaf(obtp)) {
        hints->cost += stats_memmove_cost((db_get_int_key_header_size()+keysize)*(count-pivot));
        memcpy((char *)nbte,
               ((char *)obte)+(db_get_int_key_header_size()+keysize)*pivot, 
               (db_get_int_key_header_size()+keysize)*(count-pivot));
    }
    else {
        hints->cost += stats_memmove_cost((db_get_int_key_header_size()+keysize)*(count-pivot-1));
        memcpy((char *)nbte,
               ((char *)obte)+(db_get_int_key_header_size()+keysize)*(pivot+1), 
               (db_get_int_key_header_size()+keysize)*(count-pivot-1));
    }
    
    /* 
     * store the pivot element, we'll need it later to propagate it 
     * to the parent page
     */
    nbte=btree_node_get_key(db, obtp, pivot);

    memset(&pivotkey, 0, sizeof(pivotkey));
    memset(&oldkey, 0, sizeof(oldkey));
    oldkey.data=key_get_key(nbte);
    oldkey.size=key_get_size(nbte);
    oldkey._flags=key_get_flags(nbte);
    st = util_copy_key(db, &oldkey, &pivotkey);
    if (st) 
    {
        (void)db_free_page(newpage, DB_MOVE_TO_FREELIST);
        goto fail_dramatically;
    }
    pivotrid=page_get_self(newpage);

    /*
     * adjust the page count
     */
    if (btree_node_is_leaf(obtp)) {
        btree_node_set_count(obtp, pivot);
        btree_node_set_count(nbtp, count-pivot);
    }
    else {
        btree_node_set_count(obtp, pivot);
        btree_node_set_count(nbtp, count-pivot-1);
    }

    /*
     * if we're in an internal page: fix the ptr_left of the new page
     * (it points to the ptr of the pivot key)
     */ 
    if (!btree_node_is_leaf(obtp)) {
        /* 
         * nbte still contains the pivot key 
         */
        btree_node_set_ptr_left(nbtp, key_get_ptr(nbte));
    }

    /*
     * insert the new element
     */
    hints->cost++;
    cmp=key_compare_pub_to_int(db, page, key, pivot);
    if (cmp < -1) 
    {
        st = (ham_status_t)cmp;
        goto fail_dramatically;
    }

    if (cmp>=0)
        st=__insert_nosplit(newpage, key, rid, 
                scratchpad->record, scratchpad->cursor, hints);
    else
        st=__insert_nosplit(page, key, rid, 
                scratchpad->record, scratchpad->cursor, hints);
    if (st) 
    {
        goto fail_dramatically;
    }
    scratchpad->cursor=0; /* don't overwrite cursor if __insert_nosplit
                             is called again */

    /*
     * fix the double-linked list of pages, and mark the pages as dirty
     */
    if (btree_node_get_right(obtp)) 
    {
        st=db_fetch_page(&oldsib, db, btree_node_get_right(obtp), 0);
        if (st)
            goto fail_dramatically;
    }
    else
    {
        oldsib=0;
    }

    if (oldsib) {
        st=ham_log_add_page_before(oldsib);
        if (st)
            goto fail_dramatically;
    }

    btree_node_set_left (nbtp, page_get_self(page));
    btree_node_set_right(nbtp, btree_node_get_right(obtp));
    btree_node_set_right(obtp, page_get_self(newpage));
    if (oldsib) {
        sbtp=ham_page_get_btree_node(oldsib);
        btree_node_set_left(sbtp, page_get_self(newpage));
        page_set_dirty(oldsib, env);
    }
    page_set_dirty(newpage, env);
    page_set_dirty(page, env);

    /* 
     * propagate the pivot key to the parent page
     */
    ham_assert(!(scratchpad->key.flags & HAM_KEY_USER_ALLOC), (0));
    if (scratchpad->key.data)
        allocator_free(env_get_allocator(env), scratchpad->key.data);
    scratchpad->key=pivotkey;
    scratchpad->rid=pivotrid;
    ham_assert(!(scratchpad->key.flags & HAM_KEY_USER_ALLOC), (0));

    return (SPLIT);

fail_dramatically:
    ham_assert(!(pivotkey.flags & HAM_KEY_USER_ALLOC), (0));
    if (pivotkey.data)
        allocator_free(env_get_allocator(env), pivotkey.data);
    return st;
}
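
The pivot choice above is what distinguishes random from sequential workloads: a middle split (pivot = count/2) balances both halves, while the "lazy split" at the very end (pivot = count-2) keeps append-style inserts from leaving half-empty pages behind. A tiny worked example for a leaf page holding 10 keys:

#include <assert.h>

static void
example_pivot_selection(void)
{
    unsigned count = 10;

    unsigned pivot_middle = count/2;    /* random inserts: split 5 / 5      */
    unsigned pivot_lazy   = count-2;    /* sequential inserts: split 8 / 2, */
                                        /* leaving the old page nearly full */
    assert(pivot_middle == 5);
    assert(pivot_lazy == 8);
}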
Example #14
static ham_status_t
__insert_cursor(ham_btree_t *be, ham_key_t *key, ham_record_t *record, 
                ham_bt_cursor_t *cursor, insert_hints_t *hints)
{
    ham_status_t st;
    ham_page_t *root;
    ham_db_t *db=be_get_db(be);
    ham_env_t *env = db_get_env(db);
    insert_scratchpad_t scratchpad;

    ham_assert(hints->force_append == HAM_FALSE, (0));
    ham_assert(hints->force_prepend == HAM_FALSE, (0));

    /* 
     * initialize the scratchpad 
     */
    memset(&scratchpad, 0, sizeof(scratchpad));
    scratchpad.be=be;
    scratchpad.record=record;
    scratchpad.cursor=cursor;

    /* 
     * get the root-page...
     */
    ham_assert(btree_get_rootpage(be)!=0, ("btree has no root page"));
    st=db_fetch_page(&root, db, btree_get_rootpage(be), 0);
    ham_assert(st ? root == NULL : 1, (0));
    if (st)
        return st;

    /* 
     * ... and start the recursion 
     */
    st=__insert_recursive(root, key, 0, &scratchpad, hints);

    /*
     * if the root page was split, we have to create a new
     * root page.
     */
    if (st==SPLIT) {
        ham_page_t *newroot;
        btree_node_t *node;

        /*
         * the root-page will be changed...
         */
        st=ham_log_add_page_before(root);
        if (st)
            return (st);

        /*
         * allocate a new root page
         */
        st=db_alloc_page(&newroot, db, PAGE_TYPE_B_ROOT, 0); 
        ham_assert(st ? newroot == NULL : 1, (0));
        if (st)
            return (st);
        ham_assert(page_get_owner(newroot), (""));
        /* clear the node header */
        memset(page_get_payload(newroot), 0, sizeof(btree_node_t));

        stats_page_is_nuked(db, root, HAM_TRUE);

        /* 
         * insert the pivot element and the ptr_left
         */ 
        node=ham_page_get_btree_node(newroot);
        btree_node_set_ptr_left(node, btree_get_rootpage(be));
        st=__insert_nosplit(newroot, &scratchpad.key, 
                scratchpad.rid, scratchpad.record, scratchpad.cursor, 
                hints);
        ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
        scratchpad.cursor=0; /* don't overwrite cursor if __insert_nosplit
                                is called again */
        if (st) {
            ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
            if (scratchpad.key.data)
                allocator_free(env_get_allocator(env), scratchpad.key.data);
            return (st);
        }

        /*
         * set the new root page
         *
         * !!
         * do NOT delete the old root page - it's still in use!
         *
         * also don't forget to flush the backend - otherwise the header
         * page of the database will not contain the updated information.
         * The backend is flushed when the database is closed, but if 
         * recovery is enabled then the flush here is critical.
         */
        btree_set_rootpage(be, page_get_self(newroot));
        be_set_dirty(be, HAM_TRUE);
        be->_fun_flush(be);

        /*
         * As we re-purpose a page, we will reset its pagecounter
         * as well to signal its first use as the new type assigned
         * here.
         */
        if (env_get_cache(env) && (page_get_type(root)!=PAGE_TYPE_B_INDEX))
            cache_update_page_access_counter(root, env_get_cache(env), 0);

        page_set_type(root, PAGE_TYPE_B_INDEX);
        page_set_dirty(root, env);
        page_set_dirty(newroot, env);

        /* the root page was modified (btree_set_rootpage) - make sure that
         * it's logged */
        if (env_get_rt_flags(env)&HAM_ENABLE_RECOVERY) {
            st=txn_add_page(env_get_txn(env), env_get_header_page(env),
                    HAM_TRUE);
            if (st)
                return (st);
        }
    }

    /*
     * release the scratchpad-memory and return to caller
     */
    ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
    if (scratchpad.key.data)
        allocator_free(env_get_allocator(env), scratchpad.key.data);

    return (st);
}
Example #15
/**
 * Allocate space in storage for, and write, the content referenced by 'data'
 * (of length 'size').
 * 
 * Conditions decide whether the data is written through the cache or 
 * directly to the device.
 * 
 * The content is, of course, prefixed by a BLOB header.
 * 
 * Partial writes are handled in this function.
 */
ham_status_t
blob_allocate(ham_env_t *env, ham_db_t *db, ham_record_t *record,
        ham_u32_t flags, ham_offset_t *blobid)
{
    ham_status_t st;
    ham_page_t *page=0;
    ham_offset_t addr;
    blob_t hdr;
    ham_u8_t *chunk_data[2];
    ham_size_t alloc_size;
    ham_size_t chunk_size[2];
    ham_device_t *device=env_get_device(env);
    ham_bool_t freshly_created = HAM_FALSE;
   
    *blobid=0;

    /*
     * PARTIAL WRITE
     * 
     * if the offset is 0 and offset+partial_size equals the full record 
     * size, then we won't have any gaps. In this case we just write the 
     * full record and ignore the partial parameters.
     */
    if (flags&HAM_PARTIAL) {
        if (record->partial_offset==0 
                && record->partial_offset+record->partial_size==record->size)
            flags&=~HAM_PARTIAL;
    }

    /*
     * in-memory-database: the blobid is actually a pointer to the memory
     * buffer, in which the blob (with the blob-header) is stored
     */
    if (env_get_rt_flags(env)&HAM_IN_MEMORY_DB) {
        blob_t *hdr;
        ham_u8_t *p=(ham_u8_t *)allocator_alloc(env_get_allocator(env), 
                                    record->size+sizeof(blob_t));
        if (!p) {
            return HAM_OUT_OF_MEMORY;
        }

        /* initialize the header */
        hdr=(blob_t *)p;
        memset(hdr, 0, sizeof(*hdr));
        blob_set_self(hdr, (ham_offset_t)PTR_TO_U64(p));
        blob_set_alloc_size(hdr, record->size+sizeof(blob_t));
        blob_set_size(hdr, record->size);

        /* do we have gaps? if yes, fill them with zeroes */
        if (flags&HAM_PARTIAL) {
            ham_u8_t *s=p+sizeof(blob_t);
            if (record->partial_offset)
                memset(s, 0, record->partial_offset);
            memcpy(s+record->partial_offset,
                    record->data, record->partial_size);
            if (record->partial_offset+record->partial_size<record->size)
                memset(s+record->partial_offset+record->partial_size, 0, 
                    record->size-(record->partial_offset+record->partial_size));
        }
        else {
            memcpy(p+sizeof(blob_t), record->data, record->size);
        }

        *blobid=(ham_offset_t)PTR_TO_U64(p);
        return (0);
    }

    memset(&hdr, 0, sizeof(hdr));

    /*
     * blobs are CHUNKSIZE-allocated 
     */
    alloc_size=sizeof(blob_t)+record->size;
    alloc_size += DB_CHUNKSIZE - 1;
    alloc_size -= alloc_size % DB_CHUNKSIZE;

    /* 
     * check if we have space in the freelist 
     */
    st = freel_alloc_area(&addr, env, db, alloc_size);
    if (!addr) 
    {
        if (st)
            return st;

        /*
         * if the blob is small AND if logging is disabled: load the page 
         * through the cache
         */
        if (__blob_from_cache(env, alloc_size)) {
            st = db_alloc_page(&page, db, PAGE_TYPE_BLOB, 
                        PAGE_IGNORE_FREELIST);
            ham_assert(st ? page == NULL : 1, (0));
            ham_assert(!st ? page != NULL : 1, (0));
            if (st)
                return st;
            /* blob pages don't have a page header */
            page_set_npers_flags(page, 
                    page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
            addr=page_get_self(page);
            /* move the remaining space to the freelist */
            (void)freel_mark_free(env, db, addr+alloc_size,
                    env_get_pagesize(env)-alloc_size, HAM_FALSE);
            blob_set_alloc_size(&hdr, alloc_size);
        }
        else {
            /*
             * otherwise use direct IO to allocate the space
             */
            ham_size_t aligned=alloc_size;
            aligned += env_get_pagesize(env) - 1;
            aligned -= aligned % env_get_pagesize(env);

            st=device->alloc(device, aligned, &addr);
            if (st) 
                return (st);

            /* if aligned!=size, and the remaining chunk is large enough:
             * move it to the freelist */
            {
                ham_size_t diff=aligned-alloc_size;
                if (diff > SMALLEST_CHUNK_SIZE) {
                    (void)freel_mark_free(env, db, addr+alloc_size, 
                            diff, HAM_FALSE);
                    blob_set_alloc_size(&hdr, aligned-diff);
                }
                else {
                    blob_set_alloc_size(&hdr, aligned);
                }
            }
            freshly_created = HAM_TRUE;
        }

        ham_assert(HAM_SUCCESS == freel_check_area_is_allocated(env, db,
                    addr, alloc_size), (0));
    }
    else {
        ham_assert(!st, (0));
        blob_set_alloc_size(&hdr, alloc_size);
    }

    blob_set_size(&hdr, record->size);
    blob_set_self(&hdr, addr);

    /*
     * PARTIAL WRITE
     *
     * are there gaps at the beginning? If yes, then we'll fill with zeros
     */
    if ((flags&HAM_PARTIAL) && (record->partial_offset)) {
        ham_u8_t *ptr;
        ham_size_t gapsize=record->partial_offset;

        ptr=allocator_calloc(env_get_allocator(env), 
                                    gapsize > env_get_pagesize(env)
                                        ? env_get_pagesize(env)
                                        : gapsize);
        if (!ptr)
            return (HAM_OUT_OF_MEMORY);

        /* 
         * first: write the header
         */
        chunk_data[0]=(ham_u8_t *)&hdr;
        chunk_size[0]=sizeof(hdr);
        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 1);
        if (st)
            return (st);

        addr+=sizeof(hdr);

        /* now fill the gap; if the gap is bigger than a pagesize we'll
         * split the gap into smaller chunks 
         */
        while (gapsize>=env_get_pagesize(env)) {
            chunk_data[0]=ptr;
            chunk_size[0]=env_get_pagesize(env);
            st=__write_chunks(env, page, addr, HAM_TRUE, 
                    freshly_created, chunk_data, chunk_size, 1);
            if (st)
                break;
            gapsize-=env_get_pagesize(env);
            addr+=env_get_pagesize(env);
        }

        /* fill the remaining gap */
        if (gapsize) {
            chunk_data[0]=ptr;
            chunk_size[0]=gapsize;

            st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                            chunk_data, chunk_size, 1);
            if (st)
                return (st);
            addr+=gapsize;
        }

        allocator_free(env_get_allocator(env), ptr);

        /* now write the "real" data */
        chunk_data[0]=(ham_u8_t *)record->data;
        chunk_size[0]=record->partial_size;

        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 1);
        if (st)
            return (st);
        addr+=record->partial_size;
    }
    else {
        /* 
         * not writing partially: write header and data, then we're done
         */
        chunk_data[0]=(ham_u8_t *)&hdr;
        chunk_size[0]=sizeof(hdr);
        chunk_data[1]=(ham_u8_t *)record->data;
        chunk_size[1]=(flags&HAM_PARTIAL) 
                        ? record->partial_size 
                        : record->size;

        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 2);
        if (st)
            return (st);
        addr+=sizeof(hdr)+
            ((flags&HAM_PARTIAL) ? record->partial_size : record->size);
    }

    /*
     * store the blobid; it will be returned to the caller
     */
    *blobid=blob_get_self(&hdr);

    /*
     * PARTIAL WRITES:
     *
     * if we have gaps at the end of the blob: just append more chunks to
     * fill these gaps. Since they can be pretty large we split them into
     * smaller chunks if necessary.
     */
    if (flags&HAM_PARTIAL) {
        if (record->partial_offset+record->partial_size < record->size) {
            ham_u8_t *ptr;
            ham_size_t gapsize=record->size
                            - (record->partial_offset+record->partial_size);

            /* now fill the gap; if the gap is bigger than a pagesize we'll
             * split the gap into smaller chunks 
             *
             * we split this loop in two - the outer loop will allocate the
             * memory buffer, thus saving some allocations
             */
            while (gapsize>env_get_pagesize(env)) {
                ham_u8_t *ptr=allocator_calloc(env_get_allocator(env), 
                                            env_get_pagesize(env));
                if (!ptr)
                    return (HAM_OUT_OF_MEMORY);
                while (gapsize>env_get_pagesize(env)) {
                    chunk_data[0]=ptr;
                    chunk_size[0]=env_get_pagesize(env);
                    st=__write_chunks(env, page, addr, HAM_TRUE, 
                            freshly_created, chunk_data, chunk_size, 1);
                    if (st)
                        break;
                    gapsize-=env_get_pagesize(env);
                    addr+=env_get_pagesize(env);
                }
                allocator_free(env_get_allocator(env), ptr);
                if (st)
                    return (st);
            }
            
            /* now write the remainder, which is at most a pagesize */
            ham_assert(gapsize<=env_get_pagesize(env), (""));

            chunk_size[0]=gapsize;
            ptr=chunk_data[0]=allocator_calloc(env_get_allocator(env), gapsize);
            if (!ptr)
                return (HAM_OUT_OF_MEMORY);

            st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 1);
            allocator_free(env_get_allocator(env), ptr);
            if (st)
                return (st);
        }
    }

    return (0);
}
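
Blob allocations above are rounded up to a multiple of DB_CHUNKSIZE with the usual round-up idiom: add DB_CHUNKSIZE-1, then subtract the remainder. A worked example follows; the concrete sizes are made up for illustration, since the real DB_CHUNKSIZE value does not appear in these excerpts.

#include <assert.h>

static void
example_chunk_rounding(void)
{
    unsigned chunksize  = 32;             /* assumed DB_CHUNKSIZE */
    unsigned alloc_size = 130;            /* assumed sizeof(blob_t) + record size */

    alloc_size += chunksize - 1;          /* 161 */
    alloc_size -= alloc_size % chunksize; /* 161 - 1 = 160 */

    assert(alloc_size == 160);            /* smallest multiple of 32 >= 130 */
}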
void 
stats_update_any_bound(ham_db_t *db, struct ham_page_t *page, ham_key_t *key, ham_u32_t find_flags, ham_s32_t slot)
{
    ham_status_t st;
    ham_runtime_statistics_dbdata_t *dbdata = db_get_db_perf_data(db);
    ham_env_t *env = db_get_env(db);
    btree_node_t *node = ham_page_get_btree_node(page);

    ham_assert(env_get_allocator(env) != 0, (0));
    ham_assert(btree_node_is_leaf(node), (0));
    if (!btree_node_get_left(node))
    {
        /* this is the leaf page which carries the lower bound key */
        ham_assert(btree_node_get_count(node) == 0 ? !btree_node_get_right(node) : 1, (0));
        if (btree_node_get_count(node) == 0)
        {
            /* range is empty 
             *
             * do not set the lower/upper boundary; otherwise we may trigger
             * a key comparison with an empty key, and the comparison function
             * might not be able to handle this.

             EDIT: the code should be able to handle that particular situation
                   as this was tested a while ago. Besides, the settings here
                   are a signal for the hinter the table is currently 
                   completely empty and no btree traversal whatsoever is 
                   needed before we find, insert or erase.

             EDIT #2: custom compare routines may break on NULL data pointers
                   (the monster test comparison function does, for example),
                   so the smarter thing to do here is NOT to set the bounds.

                   The trouble with that approach is that the hinter no longer
                   'knows about' an empty table, but is that so bad? An empty
                   table would constitute only a btree root node anyway, so the
                   regular traversal would be quick anyhow.
             */
            if (dbdata->lower_bound_index != 1
                || dbdata->upper_bound_index != 0)
            {
                /* only set when not done already */
                if (dbdata->lower_bound.data)
                    allocator_free(env_get_allocator(env), dbdata->lower_bound.data);
                if (dbdata->upper_bound.data)
                    allocator_free(env_get_allocator(env), dbdata->upper_bound.data);
                memset(&dbdata->lower_bound, 0, sizeof(dbdata->lower_bound));
                memset(&dbdata->upper_bound, 0, sizeof(dbdata->upper_bound));
                dbdata->lower_bound_index = 1; /* impossible value for lower bound index */
                dbdata->upper_bound_index = 0;
                dbdata->lower_bound_page_address = page_get_self(page);
                dbdata->upper_bound_page_address = 0; /* page_get_self(page); */
                dbdata->lower_bound_set = HAM_TRUE;
                dbdata->upper_bound_set = HAM_FALSE; /* cannot be TRUE or subsequent updates for single record carrying tables may fail */
                //ham_assert(dbdata->lower_bound.data != NULL, (0));
                ham_assert(dbdata->lower_bound_page_address != 0, (0));
            }
        }
        else
        {
            /*
            lower bound key is always located at index [0]

            update our key info when either our current data is undefined (startup condition)
            or the first key was edited in some way (slot == 0). This 'copy anyway' approach 
            saves us one costly key comparison.
            */
            if (dbdata->lower_bound_index != 0
                || dbdata->lower_bound_page_address != page_get_self(page)
                || slot == 0)
            {
                page_add_ref(page);

                /* only set when not done already */
                dbdata->lower_bound_set = HAM_TRUE;
                dbdata->lower_bound_index = 0;
                dbdata->lower_bound_page_address = page_get_self(page);

                if (dbdata->lower_bound.data) {
                    allocator_free(env_get_allocator(env), dbdata->lower_bound.data);
                    dbdata->lower_bound.data=0;
                    dbdata->lower_bound.size=0;
                }

                st = util_copy_key_int2pub(db, 
                    btree_node_get_key(db, node, dbdata->lower_bound_index),
                    &dbdata->lower_bound);
                if (st) 
                {
                    /* panic! in case of failure, just drop the lower bound 
                     * entirely. */
                    if (dbdata->lower_bound.data)
                        allocator_free(env_get_allocator(env), dbdata->lower_bound.data);
                    memset(&dbdata->lower_bound, 0, 
                            sizeof(dbdata->lower_bound));
                    dbdata->lower_bound_index = 0;
                    dbdata->lower_bound_page_address = 0;
                    dbdata->lower_bound_set = HAM_FALSE;
                }
                else
                {
                    ham_assert(dbdata->lower_bound.data == NULL ?
                        dbdata->lower_bound.size == 0 : 
                        dbdata->lower_bound.size > 0, (0));
                    ham_assert(dbdata->lower_bound_page_address != 0, (0));
                }
                page_release_ref(page);
            }
        }
    }

    if (!btree_node_get_right(node)) 
    {
        /* this is the leaf page which carries the upper bound key */
        ham_assert(btree_node_get_count(node) == 0 
                ? !btree_node_get_left(node) 
                : 1, (0));
        if (btree_node_get_count(node) != 0) 
        {
            /* 
             * The range is non-empty; the other case has already been handled
             * above. The upper bound key is always located at index [size-1].
             * Update our key info when either our current data is
             * undefined (startup condition) or the last key was edited in
             * some way (slot == size-1). This 'copy anyway' approach
             * saves us one costly key comparison.
             */
            if (dbdata->upper_bound_index != btree_node_get_count(node) - 1
                    || dbdata->upper_bound_page_address != page_get_self(page)
                    || slot == btree_node_get_count(node) - 1) 
            {
                page_add_ref(page);

                /* only set when not done already */
                dbdata->upper_bound_set = HAM_TRUE;
                dbdata->upper_bound_index = btree_node_get_count(node) - 1;
                dbdata->upper_bound_page_address = page_get_self(page);

                if (dbdata->upper_bound.data) {
                    allocator_free(env_get_allocator(env), dbdata->upper_bound.data);
                    dbdata->upper_bound.data=0;
                    dbdata->upper_bound.size=0;
                }

                st = util_copy_key_int2pub(db, 
                    btree_node_get_key(db, node, dbdata->upper_bound_index),
                    &dbdata->upper_bound);
                if (st) 
                {
                    /* panic! in case of failure, just drop the upper bound
                     * entirely. */
                    if (dbdata->upper_bound.data)
                        allocator_free(env_get_allocator(env), dbdata->upper_bound.data);
                    memset(&dbdata->upper_bound, 0, 
                            sizeof(dbdata->upper_bound));
                    dbdata->upper_bound_index = 0;
                    dbdata->upper_bound_page_address = 0;
                    dbdata->upper_bound_set = HAM_FALSE;
                }
                page_release_ref(page);
            }
        }
    }
}
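/*
 * A minimal standalone sketch (not hamsterdb code) of the bound-caching idea
 * used in the function above: keep a private copy of the first key of the
 * leftmost leaf and of the last key of the rightmost leaf, so that
 * out-of-range lookups can be rejected without descending the btree.
 * All names below (bounds_cache_t, sketch_key_t, ...) are simplified
 * stand-ins, not the real library structures.
 */
#include <stdlib.h>
#include <string.h>

typedef struct {
    void   *data;
    size_t  size;
} sketch_key_t;

typedef struct {
    sketch_key_t lower;        /* copy of the smallest key in the tree   */
    sketch_key_t upper;        /* copy of the largest key in the tree    */
    int          lower_set;    /* 1 once 'lower' holds a valid copy      */
    int          upper_set;    /* 1 once 'upper' holds a valid copy      */
} bounds_cache_t;

/* replace a cached bound with a private copy of 'key'; on allocation
 * failure the bound is simply dropped, mirroring the "panic" path above */
static int
bounds_cache_store(sketch_key_t *slot, int *set_flag,
                   const void *key, size_t size)
{
    free(slot->data);
    slot->data = malloc(size);
    if (!slot->data) {
        slot->size = 0;
        *set_flag  = 0;
        return -1;
    }
    memcpy(slot->data, key, size);
    slot->size = size;
    *set_flag  = 1;
    return 0;
}

/* called after a key was inserted or edited at 'slot' of a leaf page;
 * is_leftmost/is_rightmost correspond to the !btree_node_get_left()/
 * !btree_node_get_right() checks in the function above */
static void
bounds_cache_update(bounds_cache_t *bc,
                    int is_leftmost, int is_rightmost,
                    size_t slot, size_t key_count,
                    const void *key, size_t key_size)
{
    if (key_count == 0)
        return;
    if (is_leftmost && slot == 0)
        bounds_cache_store(&bc->lower, &bc->lower_set, key, key_size);
    if (is_rightmost && slot == key_count - 1)
        bounds_cache_store(&bc->upper, &bc->upper_set, key, key_size);
}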
Beispiel #17
0
/**
 * write a series of data chunks to storage at file offset 'addr'.
 * 
 * The chunks are assumed to be stored in sequential order, adjacent
 * to each other, i.e. as one long data strip.
 * 
 * Writing is performed on a per-page basis; special conditions decide,
 * for each page, whether the write goes through the page cache or
 * directly to the device. (A simplified standalone sketch of the
 * page-splitting arithmetic follows this function.)
 */
static ham_status_t
__write_chunks(ham_env_t *env, ham_page_t *page, ham_offset_t addr, 
        ham_bool_t allocated, ham_bool_t freshly_created, 
        ham_u8_t **chunk_data, ham_size_t *chunk_size, 
        ham_size_t chunks)
{
    ham_size_t i;
    ham_status_t st;
    ham_offset_t pageid;
    ham_device_t *device=env_get_device(env);
	ham_size_t pagesize = env_get_pagesize(env);

    ham_assert(freshly_created ? allocated : 1, (0));

    /*
     * for each chunk...
     */
    for (i=0; i<chunks; i++) {
        while (chunk_size[i]) {
            /*
             * get the page-ID from this chunk
             */
            pageid = addr - (addr % pagesize);

            /*
             * is this the current page?
             */
            if (page && page_get_self(page)!=pageid)
                page=0;

            /*
             * fetch the page from the cache, if it's in the cache
             * (unless we're logging - in this case always go through
             * the buffered routines)
             */
            if (!page) {
                /*
                 * keep pages in cache when they are located at the 'edges' of 
                 * the blob, as they MAY be accessed for different data.
                 * Of course, when a blob is small, there's only one (partial) 
                 * page accessed anyhow, so that one should end up in cache 
                 * then.
                 *
                 * When transaction logging is turned on, it's the same story, 
                 * really. We _could_ keep all those pages in cache now,
                 * but this would be thrashing the cache with blob data that's 
                 * accessed once only and for transaction abort (or commit)
                 * the amount of effort does not change.
                 *
                 * THOUGHT:
                 *
                 * Do we actually care what was in that page, which is going 
                 * to be overwritten in its entirety, BEFORE we do this, i.e. 
                 * before the transaction? 
                 *
                 * Answer: NO (and YES in special circumstances).
                 *
                 * Elaboration: As this would have been free space before, the 
                 * actual content does not matter, so it's not required to add
                 * the FULL pages written by the blob write action here to the 
                 * transaction log: even on transaction abort, that lingering 
                 * data is marked as 'bogus'/free as it was before anyhow.
                 *
                 * And then, assuming a longer-running transaction, where this
                 * page was freed during a previous action WITHIN
                 * the transaction, well, then the transaction log should
                 * already carry this page's previous content as instructed 
                 * by the erase operation. HOWEVER, the erase operation would 
                 * not have a particular NEED to edit this page, as an erase op 
                 * is complete by just marking this space as free in the 
                 * freelist, resulting in the freelist pages (and the btree 
                 * pages) being the only ones being edited and ending up in 
                 * the transaction log then.
                 *
                 * Which means we'll have to log the previous content of these 
                 * pages to the transaction log anyhow. UNLESS, that is, when
                 * WE allocated these pages in the first place: then there 
                 * cannot be any 'pre-transaction' state of these pages 
                 * except that of 'not existing', i.e. 'free'. In which case, 
                 * their actual content doesn't matter! (freshly_created)
                 *
                 * And what if we have recovery logging turned on, but it's 
                 * not about an active transaction here?
                 * In that case, the recovery log would only log the OLD page
                 * content, which, as concluded above, is never significant. Of
                 * course, that's assuming (again!) that we're writing to 
                 * freshly created pages, which no-one has seen before. 
                 *
                 * Just as long as we can prevent this section from thrashing 
                 * the page cache, thank you very much...
                 */
                ham_bool_t at_blob_edge = (__blob_from_cache(env, chunk_size[i])
                        || (addr % pagesize) != 0 
                        || chunk_size[i] < pagesize);
                ham_bool_t cacheonly = (!at_blob_edge 
                                    && (!env_get_log(env)
                                        || freshly_created));
				//ham_assert(db_get_txn(db) ? !!env_get_log(env) : 1, (0));
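                /*
                 * Summary of the decision above: pages at the edges of the
                 * blob (at_blob_edge) are fetched normally so the partially
                 * overwritten pages go through the cache; interior pages
                 * are fetched cache-only (DB_ONLY_FROM_CACHE) when no log
                 * is active or the pages were freshly created - if they are
                 * not cached, the data is written straight to the device
                 * below; otherwise they are fetched with
                 * DB_NEW_PAGE_DOES_THRASH_CACHE so they do not evict more
                 * useful pages.
                 */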

                st=env_fetch_page(&page, env, pageid, 
                        cacheonly ? DB_ONLY_FROM_CACHE : 
                        at_blob_edge ? 0 : DB_NEW_PAGE_DOES_THRASH_CACHE);
				ham_assert(st ? !page : 1, (0));
                /* blob pages don't have a page header */
                if (page)
                {
                    page_set_npers_flags(page, 
                        page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
                    /* if this page was recently allocated by the parent
                     * function: set a flag */
                    if (cacheonly 
                            && allocated 
                            && addr==page_get_self(page) 
                            && env_get_txn(env))
                        page_set_alloc_txn_id(page, txn_get_id(env_get_txn(env)));
                }
                else if (st) {
                    return st;
                }
            }

            /*
             * if we have a page pointer: use it; otherwise write directly
             * to the device
             */
            if (page) {
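                /* write only the part of the chunk that falls into this
                 * page: start at the chunk's offset within the page and
                 * stop at the page boundary (or at the end of the chunk,
                 * whichever comes first); the surrounding loop advances
                 * addr/chunk_data/chunk_size and handles the remainder in
                 * the next iteration */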
                ham_size_t writestart=
                        (ham_size_t)(addr-page_get_self(page));
                ham_size_t writesize =
                        (ham_size_t)(pagesize - writestart);
                if (writesize>chunk_size[i])
                    writesize=chunk_size[i];
                if ((st=ham_log_add_page_before(page)))
                    return (st);
                memcpy(&page_get_raw_payload(page)[writestart], chunk_data[i],
                            writesize);
                page_set_dirty(page, env);
                addr+=writesize;
                chunk_data[i]+=writesize;
                chunk_size[i]-=writesize;
            }
            else {
                ham_size_t s = chunk_size[i];
                /* limit to the next page boundary */
                if (s > pageid+pagesize-addr)
                    s = (ham_size_t)(pageid+pagesize-addr);
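                /* with recovery logging enabled, this direct-to-device path
                 * may only be taken for freshly created pages: their
                 * pre-write content is free space by definition, so nothing
                 * needs to be logged before overwriting it (see the long
                 * note above) - the assert below documents that invariant */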

                ham_assert(env_get_log(env) ? freshly_created : 1, (0));

                st=device->write(device, addr, chunk_data[i], s);
                if (st)
                    return st;
                addr+=s;
                chunk_data[i]+=s;
                chunk_size[i]-=s;
            }
        }
    }

    return (0);
}
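/*
 * A minimal standalone sketch (not hamsterdb code) of the page-splitting
 * arithmetic used in __write_chunks() above: each iteration maps the file
 * offset to its page (pageid = addr - addr % pagesize) and writes at most
 * up to the next page boundary. The write_fn callback is a placeholder for
 * either the memcpy-into-cached-page or the direct device write shown above.
 */
#include <stddef.h>

typedef int (*write_fn_t)(size_t addr, const unsigned char *data, size_t len);

static int
split_write(size_t addr, const unsigned char *data, size_t size,
            size_t pagesize, write_fn_t write_fn)
{
    while (size) {
        /* page that contains 'addr', and the offset of 'addr' inside it */
        size_t pageid = addr - (addr % pagesize);
        size_t offset = addr - pageid;

        /* never cross the page boundary in a single write */
        size_t len = pagesize - offset;
        if (len > size)
            len = size;

        int st = write_fn(addr, data, len);
        if (st)
            return st;

        addr += len;
        data += len;
        size -= len;
    }
    return 0;
}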
/**                                                                 
 * create and initialize a new backend                              
 *                                                                  
 * @remark this function is called after the @a ham_db_t structure  
 * and the file were created                                        
 *                                                                  
 * the @a flags are stored in the database; only transfer           
 * the persistent flags!                                            
 *
 * @note This is a B+-tree 'backend' method.
 */                                                                 
static ham_status_t 
my_fun_create(ham_btree_t *be, ham_u16_t keysize, ham_u32_t flags)
{
    ham_status_t st;
    ham_page_t *root;
    ham_size_t maxkeys;
    ham_db_t *db=be_get_db(be);
    db_indexdata_t *indexdata=env_get_indexdata_ptr(db_get_env(db), 
                                db_get_indexdata_offset(db));
    if (be_is_active(be))
    {
        ham_trace(("backend has alread been initialized before!"));
        /* HAM_INTERNAL_ERROR -- not really, when keeping custom 
         * backends in mind */
        return HAM_ALREADY_INITIALIZED; 
    }

    /* 
     * prevent overflow - maxkeys only has 16 bits!
     * (a simplified sketch of this guard follows this function)
     */
    maxkeys=btree_calc_maxkeys(env_get_pagesize(db_get_env(db)), keysize);
    if (maxkeys>MAX_KEYS_PER_NODE) {
        ham_trace(("keysize/pagesize ratio too high"));
        return HAM_INV_KEYSIZE;
    }
    else if (maxkeys==0) {
        ham_trace(("keysize too large for the current pagesize"));
        return HAM_INV_KEYSIZE;
    }

    /*
     * allocate a new root page
     */
    st=db_alloc_page(&root, db, PAGE_TYPE_B_ROOT, PAGE_IGNORE_FREELIST);
    ham_assert(st ? root == NULL : 1, (0));
    ham_assert(!st ? root != NULL : 1, (0));
    if (!root)
        return st ? st : HAM_INTERNAL_ERROR;

    memset(page_get_raw_payload(root), 0, 
            sizeof(btree_node_t)+sizeof(ham_perm_page_union_t));

    /*
     * calculate the maximum number of keys for this page, 
     * and make sure that this number is even
     */
    btree_set_maxkeys(be, (ham_u16_t)maxkeys);
    be_set_dirty(be, HAM_TRUE);
    be_set_keysize(be, keysize);
    be_set_flags(be, flags);

    btree_set_rootpage(be, page_get_self(root));

    index_clear_reserved(indexdata);
    index_set_max_keys(indexdata, (ham_u16_t)maxkeys);
    index_set_keysize(indexdata, keysize);
    index_set_self(indexdata, page_get_self(root));
    index_set_flags(indexdata, flags);
    index_set_recno(indexdata, 0);
    index_clear_reserved(indexdata);

    env_set_dirty(db_get_env(db));

    be_set_active(be, HAM_TRUE);

    return (0);
}
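/*
 * A minimal standalone sketch (not hamsterdb code) of the maxkeys guard in
 * my_fun_create() above. The formula below is a simplified stand-in (page
 * payload divided by a per-key footprint), not the real btree_calc_maxkeys();
 * the constants are assumptions for illustration only. The point is the two
 * checks: the result must fit the 16-bit maxkeys field and must not be zero,
 * and the count is kept even as required by the backend above.
 */
#include <stdint.h>

#define SKETCH_MAX_KEYS_PER_NODE  0xFFFFu  /* 16-bit limit (assumed value)   */
#define SKETCH_NODE_OVERHEAD      64u      /* per-page header (assumed)      */
#define SKETCH_KEY_OVERHEAD       16u      /* per-key bookkeeping (assumed)  */

static int
sketch_check_maxkeys(uint32_t pagesize, uint32_t keysize, uint16_t *maxkeys)
{
    uint32_t per_key = keysize + SKETCH_KEY_OVERHEAD;
    uint32_t payload = (pagesize > SKETCH_NODE_OVERHEAD)
                            ? pagesize - SKETCH_NODE_OVERHEAD : 0;
    uint32_t n = payload / per_key;

    /* keep the count even, mirroring the comment in my_fun_create() */
    n &= ~(uint32_t)1;

    if (n > SKETCH_MAX_KEYS_PER_NODE)
        return -1;      /* keysize/pagesize ratio too high (HAM_INV_KEYSIZE) */
    if (n == 0)
        return -2;      /* keysize too large for this pagesize               */

    *maxkeys = (uint16_t)n;
    return 0;
}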