Example #1
static ham_status_t
__insert_cursor(ham_btree_t *be, ham_key_t *key, ham_record_t *record, 
                ham_bt_cursor_t *cursor, insert_hints_t *hints)
{
    ham_status_t st;
    ham_page_t *root;
    ham_db_t *db=be_get_db(be);
    ham_env_t *env = db_get_env(db);
    insert_scratchpad_t scratchpad;

    ham_assert(hints->force_append == HAM_FALSE, (0));
    ham_assert(hints->force_prepend == HAM_FALSE, (0));

    /* 
     * initialize the scratchpad 
     */
    memset(&scratchpad, 0, sizeof(scratchpad));
    scratchpad.be=be;
    scratchpad.record=record;
    scratchpad.cursor=cursor;

    /* 
     * get the root-page...
     */
    ham_assert(btree_get_rootpage(be)!=0, ("btree has no root page"));
    st=db_fetch_page(&root, db, btree_get_rootpage(be), 0);
    ham_assert(st ? root == NULL : 1, (0));
    if (st)
        return st;

    /* 
     * ... and start the recursion 
     */
    st=__insert_recursive(root, key, 0, &scratchpad, hints);

    /*
     * if the root page was split, we have to create a new
     * root page.
     */
    if (st==SPLIT) {
        ham_page_t *newroot;
        btree_node_t *node;

        /*
         * the root-page will be changed...
         */
        st=ham_log_add_page_before(root);
        if (st)
            return (st);

        /*
         * allocate a new root page
         */
        st=db_alloc_page(&newroot, db, PAGE_TYPE_B_ROOT, 0); 
        ham_assert(st ? newroot == NULL : 1, (0));
        if (st)
            return (st);
        ham_assert(page_get_owner(newroot), (""));
        /* clear the node header */
        memset(page_get_payload(newroot), 0, sizeof(btree_node_t));

        stats_page_is_nuked(db, root, HAM_TRUE);

        /* 
         * insert the pivot element and the ptr_left
         */ 
        node=ham_page_get_btree_node(newroot);
        btree_node_set_ptr_left(node, btree_get_rootpage(be));
        st=__insert_nosplit(newroot, &scratchpad.key, 
                scratchpad.rid, scratchpad.record, scratchpad.cursor, 
                hints);
        ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
        scratchpad.cursor=0; /* don't overwrite cursor if __insert_nosplit
                                is called again */
        if (st) {
            ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
            if (scratchpad.key.data)
                allocator_free(env_get_allocator(env), scratchpad.key.data);
            return (st);
        }

        /*
         * set the new root page
         *
         * !!
         * do NOT delete the old root page - it's still in use!
         *
         * also don't forget to flush the backend - otherwise the header
         * page of the database will not contain the updated information.
         * The backend is flushed when the database is closed, but if 
         * recovery is enabled then the flush here is critical.
         */
        btree_set_rootpage(be, page_get_self(newroot));
        be_set_dirty(be, HAM_TRUE);
        be->_fun_flush(be);

        /*
         * As we re-purpose the page, also reset its cache access counter
         * to signal its first use under the new page type assigned here.
         */
        if (env_get_cache(env) && (page_get_type(root)!=PAGE_TYPE_B_INDEX))
            cache_update_page_access_counter(root, env_get_cache(env), 0);

        page_set_type(root, PAGE_TYPE_B_INDEX);
        page_set_dirty(root, env);
        page_set_dirty(newroot, env);

        /* the root page was modified (btree_set_rootpage) - make sure that
         * it's logged */
        if (env_get_rt_flags(env)&HAM_ENABLE_RECOVERY) {
            st=txn_add_page(env_get_txn(env), env_get_header_page(env),
                    HAM_TRUE);
            if (st)
                return (st);
        }
    }

    /*
     * release the scratchpad-memory and return to caller
     */
    ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
    if (scratchpad.key.data)
        allocator_free(env_get_allocator(env), scratchpad.key.data);

    return (st);
}
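
The interesting move in Example #1 is the root-split pattern: when the recursive insert returns SPLIT, a fresh root page is allocated, its ptr_left is wired to the old root, and the pivot key produced by the split becomes the new root's only entry. Below is a minimal, self-contained sketch of that pattern using hypothetical simplified types, not hamsterdb's real structures:

#include <stdlib.h>

/* hypothetical simplified node type - NOT hamsterdb's btree_node_t */
typedef struct node {
    int          pivot;      /* single pivot key, for illustration */
    struct node *ptr_left;   /* subtree with keys <  pivot         */
    struct node *ptr_right;  /* subtree with keys >= pivot         */
} node_t;

/* grow the tree by one level after a root split: the old root and its
 * new sibling become the two children of a freshly allocated root */
static node_t *
grow_root(node_t *old_root, node_t *new_sibling, int pivot)
{
    node_t *new_root = calloc(1, sizeof(*new_root));
    if (!new_root)
        return NULL;         /* caller keeps the old root on failure */
    new_root->pivot     = pivot;
    new_root->ptr_left  = old_root;     /* do NOT free the old root -
                                           it's still in use as a child */
    new_root->ptr_right = new_sibling;
    return new_root;
}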
Example #2
/**
 * write a series of data chunks to storage at file offset 'addr'.
 *
 * The chunks are assumed to be stored in sequential order, adjacent
 * to each other, i.e. as one long data strip.
 *
 * Writing is performed on a per-page basis; for each page touched,
 * special conditions decide whether the write goes through the
 * page cache or directly to the device.
 */
static ham_status_t
__write_chunks(ham_env_t *env, ham_page_t *page, ham_offset_t addr,
               ham_bool_t allocated, ham_bool_t freshly_created,
               ham_u8_t **chunk_data, ham_size_t *chunk_size,
               ham_size_t chunks)
{
    ham_size_t i;
    ham_status_t st;
    ham_offset_t pageid;
    ham_device_t *device=env_get_device(env);
    ham_size_t pagesize = env_get_pagesize(env);

    ham_assert(freshly_created ? allocated : 1, (0));

    /*
     * for each chunk...
     */
    for (i=0; i<chunks; i++) {
        while (chunk_size[i]) {
            /*
             * compute the page ID by aligning addr down to the
             * page boundary
             */
            pageid = addr - (addr % pagesize);

            /*
             * is this the current page?
             */
            if (page && page_get_self(page)!=pageid)
                page=0;

            /*
             * fetch the page from the cache, if it's in the cache
             * (unless we're logging - in this case always go through
             * the buffered routines)
             */
            if (!page) {
                /*
                 * keep pages in cache when they are located at the 'edges' of
                 * the blob, as they MAY be accessed for different data.
                 * Of course, when a blob is small, there's only one (partial)
                 * page accessed anyhow, so that one should end up in cache
                 * then.
                 *
                 * When transaction logging is turned on, it's the same story,
                 * really. We _could_ keep all those pages in cache now,
                 * but this would be thrashing the cache with blob data that's
                 * accessed once only and for transaction abort (or commit)
                 * the amount of effort does not change.
                 *
                 * THOUGHT:
                 *
                 * Do we actually care what was in that page, which is going
                 * to be overwritten in its entirety, BEFORE we do this, i.e.
                 * before the transaction?
                 *
                 * Answer: NO (and YES in special circumstances).
                 *
                 * Elaboration: As this would have been free space before, the
                 * actual content does not matter, so it's not required to add
                 * the FULL pages written by the blob write action here to the
                 * transaction log: even on transaction abort, that lingering
                 * data is marked as 'bogus'/free as it was before anyhow.
                 *
                 * And then, assuming a longer running transaction, where this
                 * page was freed during a previous action WITHIN
                 * the transaction, well, then the transaction log should
                 * already carry this page's previous content as instructed
                 * by the erase operation. HOWEVER, the erase operation would
                 * not have a particular NEED to edit this page, as an erase op
                 * is complete by just marking this space as free in the
                 * freelist, resulting in the freelist pages (and the btree
                 * pages) being the only ones being edited and ending up in
                 * the transaction log then.
                 *
                 * Which means we'll have to log the previous content of these
                 * pages to the transaction log anyhow. UNLESS, that is, when
                 * WE allocated these pages in the first place: then there
                 * cannot be any 'pre-transaction' state of these pages
                 * except that of 'not existing', i.e. 'free'. In which case,
                 * their actual content doesn't matter! (freshly_created)
                 *
                 * And what if we have recovery logging turned on, but it's
                 * not about an active transaction here?
                 * In that case, the recovery log would only log the OLD page
                 * content, which we've concluded is never significant. Of
                 * course, that's assuming (again!) that we're writing to
                 * freshly created pages, which no-one has seen before.
                 *
                 * Just as long as we can prevent this section from thrashing
                 * the page cache, thank you very much...
                 */
                ham_bool_t at_blob_edge = (__blob_from_cache(env, chunk_size[i])
                                           || (addr % pagesize) != 0
                                           || chunk_size[i] < pagesize);
                ham_bool_t cacheonly = (!at_blob_edge
                                        && (!env_get_log(env)
                                            || freshly_created));
                //ham_assert(db_get_txn(db) ? !!env_get_log(env) : 1, (0));

                st=env_fetch_page(&page, env, pageid,
                                  cacheonly ? DB_ONLY_FROM_CACHE :
                                  at_blob_edge ? 0 : DB_NEW_PAGE_DOES_THRASH_CACHE);
                ham_assert(st ? !page : 1, (0));
                /* blob pages don't have a page header */
                if (page)
                {
                    page_set_npers_flags(page,
                                         page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
                    /* if this page was recently allocated by the parent
                     * function: set a flag */
                    if (cacheonly
                            && allocated
                            && addr==page_get_self(page)
                            && env_get_txn(env))
                        page_set_alloc_txn_id(page, txn_get_id(env_get_txn(env)));
                }
                else if (st) {
                    return st;
                }
            }

            /*
             * if we have a page pointer: use it; otherwise write directly
             * to the device
             */
            if (page) {
                ham_size_t writestart=
                    (ham_size_t)(addr-page_get_self(page));
                ham_size_t writesize =
                    (ham_size_t)(pagesize - writestart);
                if (writesize>chunk_size[i])
                    writesize=chunk_size[i];
                if ((st=ham_log_add_page_before(page)))
                    return (st);
                memcpy(&page_get_raw_payload(page)[writestart], chunk_data[i],
                       writesize);
                page_set_dirty(page, env);
                addr+=writesize;
                chunk_data[i]+=writesize;
                chunk_size[i]-=writesize;
            }
            else {
                ham_size_t s = chunk_size[i];
                /* limit to the next page boundary */
                if (s > pageid+pagesize-addr)
                    s = (ham_size_t)(pageid+pagesize-addr);

                ham_assert(env_get_log(env) ? freshly_created : 1, (0));

                st=device->write(device, addr, chunk_data[i], s);
                if (st)
                    return st;
                addr+=s;
                chunk_data[i]+=s;
                chunk_size[i]-=s;
            }
        }
    }

    return (0);
}
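
Callers describe the data strip as two parallel arrays - chunk pointers and chunk sizes - and __write_chunks walks page by page across all of them. A hedged sketch of what a call site might look like when writing a header followed by its payload as two adjacent chunks ('hdr', 'record', 'addr', 'page', 'freshly_created' and 'st' are assumed context, and 'hdr' is not hamsterdb's actual blob header layout):

/* sketch: write a hypothetical header struct plus the user data as
 * one contiguous strip starting at file offset 'addr' */
ham_u8_t  *chunk_data[2] = { (ham_u8_t *)&hdr, (ham_u8_t *)record->data };
ham_size_t chunk_size[2] = { sizeof(hdr),      record->size };

st=__write_chunks(env, page, addr,
                  HAM_TRUE,        /* allocated: we own these pages    */
                  freshly_created, /* no pre-transaction state to log  */
                  chunk_data, chunk_size, 2);
if (st)
    return (st);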
Example #3
static ham_status_t
__append_key(ham_btree_t *be, ham_key_t *key, ham_record_t *record, 
                ham_bt_cursor_t *cursor, insert_hints_t *hints)
{
    ham_status_t st=0;
    ham_page_t *page;
    btree_node_t *node;
    ham_db_t *db;

#ifdef HAM_DEBUG
    if (cursor && !bt_cursor_is_nil(cursor)) {
        ham_assert(be_get_db(be) == bt_cursor_get_db(cursor), (0));
    }
#endif

    db = be_get_db(be);

    /*
     * check if the hinted btree leaf is still in the cache; if not,
     * revert to a regular descent.
     *
     * As this is a speed-improvement hint that re-uses recent material,
     * the page should still sit in the cache; otherwise the hint is
     * stale and must be discarded.
     */
    st = db_fetch_page(&page, db, hints->leaf_page_addr, DB_ONLY_FROM_CACHE);
    if (st)
        return st;
    if (!page) {
        hints->force_append = HAM_FALSE;
        hints->force_prepend = HAM_FALSE;
        return (__insert_cursor(be, key, record, cursor, hints));
    }

    page_add_ref(page);
    node=ham_page_get_btree_node(page);
    ham_assert(btree_node_is_leaf(node), ("iterator points to internal node"));

    /*
     * if the page is already full, OR this page is not the right-most
     * page when we APPEND (or not the left-most page when we PREPEND):
     * perform a normal insert
     */
    if ((hints->force_append && btree_node_get_right(node))
            || (hints->force_prepend && btree_node_get_left(node))
            || btree_node_get_count(node) >= btree_get_maxkeys(be)) {
        page_release_ref(page);
        hints->force_append = HAM_FALSE;
        hints->force_prepend = HAM_FALSE;
        return (__insert_cursor(be, key, record, cursor, hints));
    }

    /*
     * if the page is not empty: check if the key is appended at the end
     * or prepended at the start (depending on force_append/force_prepend),
     * or if it actually lands in the middle (when neither force_append
     * nor force_prepend is specified: that'd be SEQUENTIAL insertion
     * hinting somewhere in the middle of the total key range).
     */
    if (btree_node_get_count(node)!=0) {
        int cmp_hi;
        int cmp_lo;

        hints->cost++;
        if (!hints->force_prepend) {
            cmp_hi = key_compare_pub_to_int(db, page, key, 
                                btree_node_get_count(node)-1);
            /* values below -1 are error codes, not comparison results */
            if (cmp_hi < -1) {
                page_release_ref(page);
                return (ham_status_t)cmp_hi;
            }
            /* key is at the end */
            if (cmp_hi > 0) {
                if (btree_node_get_right(node)) {
                    /* not at top end of the btree, so we can't do the 
                     * fast track */
                    page_release_ref(page);
                    //hints->flags &= ~HAM_HINT_APPEND;
                    hints->force_append = HAM_FALSE;
                    hints->force_prepend = HAM_FALSE;
                    return (__insert_cursor(be, key, record, cursor, hints));
                }

                hints->force_append = HAM_TRUE;
                hints->force_prepend = HAM_FALSE;
            }
        }
        else { /* hints->force_prepend is true */
            /* pretend the key is not bigger than the highest key on
             * this page, since we're PREPENDing */
            cmp_hi = -1;
        }

        if (!hints->force_append) {
            cmp_lo = key_compare_pub_to_int(db, page, key, 0);
            /* values below -1 are error codes, not comparison results */
            if (cmp_lo < -1) {
                page_release_ref(page);
                return ((ham_status_t)cmp_lo);
            }
            /* key is at the start of page */
            if (cmp_lo < 0) {
                if (btree_node_get_left(node)) {
                    /* not at bottom end of the btree, so we can't 
                     * do the fast track */
                    page_release_ref(page);
                    //hints->flags &= ~HAM_HINT_PREPEND;
                    hints->force_append = HAM_FALSE;
                    hints->force_prepend = HAM_FALSE;
                    return (__insert_cursor(be, key, record, cursor, hints));
                }

                hints->force_append = HAM_FALSE;
                hints->force_prepend = HAM_TRUE;
            }
        }
        else { /* hints->force_append is true */
            /* pretend the key is not smaller than the lowest key on
             * this page, since we're APPENDing */
            cmp_lo = +1;
        }

        /* handle inserts in the middle range */
        if (cmp_lo >= 0 && cmp_hi <= 0) {
            /*
             * Depending on where we are in the btree, the current key either
             * is going to end up in the middle of the given node/page,
             * OR the given key is out of range of the given leaf node.
             */
            if (hints->force_append || hints->force_prepend) {
                /*
                 * when prepend or append is FORCED, we are expected to 
                 * add keys ONLY at the beginning or end of the btree
                 * key range. Clearly the current key does not fit that
                 * criterion.
                 */
                page_release_ref(page);
                //hints->flags &= ~HAM_HINT_PREPEND;
                hints->force_append = HAM_FALSE;
                hints->force_prepend = HAM_FALSE;
                return (__insert_cursor(be, key, record, cursor, hints));
            }

            /* 
             * we discovered that the key must be inserted in the middle 
             * of the current leaf.
             * 
             * It does not matter whether the current leaf is at the start or
             * end of the btree range; as we need to add the key in the middle
             * of the current leaf, that info alone is enough to continue with
             * the fast track insert operation.
             */
            ham_assert(!hints->force_prepend && !hints->force_append, (0));
        }

        ham_assert((hints->force_prepend + hints->force_append) < 2, 
                ("Either APPEND or PREPEND flag MAY be set, but not both"));
    }
    else { /* empty page: force insertion in slot 0 */
        hints->force_append = HAM_FALSE;
        hints->force_prepend = HAM_TRUE;
    }

    /*
     * the page will be changed - write it to the log (if a log exists)
     */
    st=ham_log_add_page_before(page);
    if (st) {
        page_release_ref(page);
        return (st);
    }

    /*
     * OK - we're really appending/prepending the new key.
     */
    ham_assert(hints->force_append || hints->force_prepend, (0));
    st=__insert_nosplit(page, key, 0, record, cursor, hints);

    page_release_ref(page);
    return (st);
}
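
Once the error codes are filtered out, the fast-path decision in Example #3 reduces to a small classification over the two comparison results and the leaf's siblings. A condensed sketch of that logic (the enum and helper are illustrative, not part of hamsterdb):

/* sketch: classify a key against a leaf's key range.
 * cmp_lo compares key vs. the lowest key, cmp_hi vs. the highest key;
 * both are -1, 0 or +1 here (error codes already handled). */
typedef enum { FAST_APPEND, FAST_PREPEND, NORMAL_INSERT } fast_path_t;

static fast_path_t
classify(int cmp_lo, int cmp_hi, int has_left, int has_right)
{
    if (cmp_hi > 0 && !has_right)
        return FAST_APPEND;   /* beyond the highest key of the whole tree */
    if (cmp_lo < 0 && !has_left)
        return FAST_PREPEND;  /* below the lowest key of the whole tree   */
    return NORMAL_INSERT;     /* in range: plain insert into this leaf    */
}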
Example #4
ham_status_t
blob_duplicate_insert(ham_db_t *db, ham_offset_t table_id,
                      ham_record_t *record, ham_size_t position, ham_u32_t flags,
                      dupe_entry_t *entries, ham_size_t num_entries,
                      ham_offset_t *rid, ham_size_t *new_position)
{
    ham_status_t st=0;
    dupe_table_t *table=0;
    ham_bool_t alloc_table=0;
    ham_bool_t resize=0;
    ham_page_t *page=0;
    ham_env_t *env=db_get_env(db);

    /*
     * create a new duplicate table if none existed, and insert
     * the first entry
     */
    if (!table_id) {
        ham_assert(num_entries==2, (""));
        /* allocates space for 8 (!) entries */
        table=allocator_calloc(env_get_allocator(env),
                               sizeof(dupe_table_t)+7*sizeof(dupe_entry_t));
        if (!table)
            return HAM_OUT_OF_MEMORY;
        dupe_table_set_capacity(table, 8);
        dupe_table_set_count(table, 1);
        memcpy(dupe_table_get_entry(table, 0), &entries[0],
               sizeof(entries[0]));

        /* skip the first entry */
        entries++;
        num_entries--;
        alloc_table=1;
    }
    else {
        /*
         * otherwise load the existing table
         */
        st=__get_duplicate_table(&table, &page, env, table_id);
        ham_assert(st ? table == NULL : 1, (0));
        ham_assert(st ? page == NULL : 1, (0));
        if (!table)
            return st ? st : HAM_INTERNAL_ERROR;
        if (!page && !(env_get_rt_flags(env)&HAM_IN_MEMORY_DB))
            alloc_table=1;
    }

    if (page)
        if ((st=ham_log_add_page_before(page)))
            return (st);

    ham_assert(num_entries==1, (""));

    /*
     * resize the table, if necessary
     */
    if (!(flags & HAM_OVERWRITE)
            && dupe_table_get_count(table)+1>=dupe_table_get_capacity(table)) {
        dupe_table_t *old=table;
        ham_size_t new_cap=dupe_table_get_capacity(table);

        if (new_cap < 3*8)
            new_cap += 8;
        else
            new_cap += new_cap/3;

        table=allocator_calloc(env_get_allocator(env), sizeof(dupe_table_t)+
                               (new_cap-1)*sizeof(dupe_entry_t));
        if (!table)
            return (HAM_OUT_OF_MEMORY);
        dupe_table_set_capacity(table, new_cap);
        dupe_table_set_count(table, dupe_table_get_count(old));
        memcpy(dupe_table_get_entry(table, 0), dupe_table_get_entry(old, 0),
               dupe_table_get_count(old)*sizeof(dupe_entry_t));
        if (alloc_table)
            allocator_free(env_get_allocator(env), old);

        alloc_table=1;
        resize=1;
    }
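
    /*
     * Worked example of the growth policy above: capacities run
     * 8 -> 16 -> 24 -> 32 -> 42 -> 56 -> 74 -> ..., i.e. +8 while
     * below 24 entries, then by roughly one third per resize.
     */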

    /*
     * insert sorted, unsorted or overwrite the entry at the requested position
     */
    if (flags&HAM_OVERWRITE) {
        dupe_entry_t *e=dupe_table_get_entry(table, position);

        if (!(dupe_entry_get_flags(e)&(KEY_BLOB_SIZE_SMALL
                                       |KEY_BLOB_SIZE_TINY
                                       |KEY_BLOB_SIZE_EMPTY))) {
            (void)blob_free(env, db, dupe_entry_get_rid(e), 0);
        }

        memcpy(dupe_table_get_entry(table, position),
               &entries[0], sizeof(entries[0]));
    }
    else {
        if (db_get_rt_flags(db)&HAM_SORT_DUPLICATES) {
            /* 'position' is unsigned; catch negative error codes from
             * __get_sorted_position in a signed variable first */
            ham_s32_t sorted_pos;
            if (page)
                page_add_ref(page);
            sorted_pos=__get_sorted_position(db, table, record, flags);
            if (page)
                page_release_ref(page);
            if (sorted_pos<0)
                return ((ham_status_t)sorted_pos);
            position=(ham_size_t)sorted_pos;
        }
        else if (flags&HAM_DUPLICATE_INSERT_BEFORE) {
            /* do nothing, insert at the current position */
        }
        else if (flags&HAM_DUPLICATE_INSERT_AFTER) {
            position++;
            if (position > dupe_table_get_count(table))
                position=dupe_table_get_count(table);
        }
        else if (flags&HAM_DUPLICATE_INSERT_FIRST) {
            position=0;
        }
        else if (flags&HAM_DUPLICATE_INSERT_LAST) {
            position=dupe_table_get_count(table);
        }
        else {
            position=dupe_table_get_count(table);
        }

        if (position != dupe_table_get_count(table)) {
            memmove(dupe_table_get_entry(table, position+1),
                    dupe_table_get_entry(table, position),
                    sizeof(entries[0])*(dupe_table_get_count(table)-position));
        }

        memcpy(dupe_table_get_entry(table, position),
               &entries[0], sizeof(entries[0]));

        dupe_table_set_count(table, dupe_table_get_count(table)+1);
    }

    /*
     * write the table back to disk and return the blobid of the table
     */
    if ((table_id && !page) || resize)
    {
        ham_record_t rec= {0};
        rec.data=(ham_u8_t *)table;
        rec.size=sizeof(dupe_table_t)
                 +(dupe_table_get_capacity(table)-1)*sizeof(dupe_entry_t);
        st=blob_overwrite(env, db, table_id, &rec, 0, rid);
    }
    else if (!table_id)
    {
        ham_record_t rec= {0};
        rec.data=(ham_u8_t *)table;
        rec.size=sizeof(dupe_table_t)
                 +(dupe_table_get_capacity(table)-1)*sizeof(dupe_entry_t);
        st=blob_allocate(env, db, &rec, 0, rid);
    }
    else if (table_id && page)
    {
        page_set_dirty(page, env);
    }
    else
    {
        ham_assert(!"shouldn't be here", (0));
    }

    if (alloc_table)
        allocator_free(env_get_allocator(env), table);

    if (new_position)
        *new_position=position;

    return (st);
}
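
Note the calling convention that the asserts in Example #4 encode: on the very first duplicate (table_id == 0) the caller passes two entries - the record that was already stored inline plus the new duplicate - while every later call passes exactly one. A hedged sketch of that first call ('db', 'record' and 'st' are assumed context; the entry contents and variable names are illustrative):

/* sketch: turn an inline record into a duplicate table on the first
 * duplicate insert - entries[0] describes the record already stored
 * in the btree, entries[1] the duplicate being added now */
dupe_entry_t entries[2];
ham_offset_t table_rid;
ham_size_t   new_pos;

/* ... fill entries[0] and entries[1] ... */

st=blob_duplicate_insert(db, 0 /* no table yet */, record,
                         0, HAM_DUPLICATE_INSERT_LAST,
                         entries, 2, &table_rid, &new_pos);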