Example #1
/**
 * Allocate space in storage for, and write, the content referenced by
 * 'record' (record->data, with length record->size).
 * 
 * Depending on the blob size and the environment flags, the data is written
 * either through the page cache or directly to the device.
 * 
 * The content is, of course, prefixed by a BLOB header.
 * 
 * Partial writes are handled in this function.
 */
ham_status_t
blob_allocate(ham_env_t *env, ham_db_t *db, ham_record_t *record,
        ham_u32_t flags, ham_offset_t *blobid)
{
    ham_status_t st;
    ham_page_t *page=0;
    ham_offset_t addr;
    blob_t hdr;
    ham_u8_t *chunk_data[2];
    ham_size_t alloc_size;
    ham_size_t chunk_size[2];
    ham_device_t *device=env_get_device(env);
    ham_bool_t freshly_created = HAM_FALSE;
   
    *blobid=0;

    /*
     * PARTIAL WRITE
     * 
     * if the partial offset is 0 and offset+partial_size covers the full
     * record size, then there are no gaps to fill. In this case we just
     * write the full record and ignore the partial parameters.
     */
    if (flags&HAM_PARTIAL) {
        if (record->partial_offset==0 
                && record->partial_offset+record->partial_size==record->size)
            flags&=~HAM_PARTIAL;
    }

    /*
     * in-memory-database: the blobid is actually a pointer to the memory
     * buffer, in which the blob (with the blob-header) is stored
     */
    if (env_get_rt_flags(env)&HAM_IN_MEMORY_DB) {
        blob_t *hdr;
        ham_u8_t *p=(ham_u8_t *)allocator_alloc(env_get_allocator(env), 
                                    record->size+sizeof(blob_t));
        if (!p) {
            return HAM_OUT_OF_MEMORY;
        }

        /* initialize the header */
        hdr=(blob_t *)p;
        memset(hdr, 0, sizeof(*hdr));
        blob_set_self(hdr, (ham_offset_t)PTR_TO_U64(p));
        blob_set_alloc_size(hdr, record->size+sizeof(blob_t));
        blob_set_size(hdr, record->size);

        /* do we have gaps? if yes, fill them with zeroes */
        if (flags&HAM_PARTIAL) {
            ham_u8_t *s=p+sizeof(blob_t);
            if (record->partial_offset)
                memset(s, 0, record->partial_offset);
            memcpy(s+record->partial_offset,
                    record->data, record->partial_size);
            if (record->partial_offset+record->partial_size<record->size)
                memset(s+record->partial_offset+record->partial_size, 0, 
                    record->size-(record->partial_offset+record->partial_size));
        }
        else {
            memcpy(p+sizeof(blob_t), record->data, record->size);
        }

        *blobid=(ham_offset_t)PTR_TO_U64(p);
        return (0);
    }

    memset(&hdr, 0, sizeof(hdr));

    /*
     * blobs are CHUNKSIZE-allocated 
     */
    alloc_size=sizeof(blob_t)+record->size;
    alloc_size += DB_CHUNKSIZE - 1;
    alloc_size -= alloc_size % DB_CHUNKSIZE;

    /* 
     * check if we have space in the freelist 
     */
    st = freel_alloc_area(&addr, env, db, alloc_size);
    if (!addr) 
    {
        if (st)
            return st;

        /*
         * if the blob is small AND if logging is disabled: load the page 
         * through the cache
         */
        if (__blob_from_cache(env, alloc_size)) {
            st = db_alloc_page(&page, db, PAGE_TYPE_BLOB, 
                        PAGE_IGNORE_FREELIST);
            ham_assert(st ? page == NULL : 1, (0));
            ham_assert(!st ? page != NULL : 1, (0));
            if (st)
                return st;
            /* blob pages don't have a page header */
            page_set_npers_flags(page, 
                    page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
            addr=page_get_self(page);
            /* move the remaining space to the freelist */
            (void)freel_mark_free(env, db, addr+alloc_size,
                    env_get_pagesize(env)-alloc_size, HAM_FALSE);
            blob_set_alloc_size(&hdr, alloc_size);
        }
        else {
            /*
             * otherwise use direct IO to allocate the space
             */
            ham_size_t aligned=alloc_size;
            aligned += env_get_pagesize(env) - 1;
            aligned -= aligned % env_get_pagesize(env);

            st=device->alloc(device, aligned, &addr);
            if (st) 
                return (st);

            /* if aligned != alloc_size, and the remaining chunk is large
             * enough: move it to the freelist */
            {
                ham_size_t diff=aligned-alloc_size;
                if (diff > SMALLEST_CHUNK_SIZE) {
                    (void)freel_mark_free(env, db, addr+alloc_size, 
                            diff, HAM_FALSE);
                    blob_set_alloc_size(&hdr, aligned-diff);
                }
                else {
                    blob_set_alloc_size(&hdr, aligned);
                }
            }
            freshly_created = HAM_TRUE;
        }

        ham_assert(HAM_SUCCESS == freel_check_area_is_allocated(env, db,
                    addr, alloc_size), (0));
    }
    else {
        ham_assert(!st, (0));
        blob_set_alloc_size(&hdr, alloc_size);
    }

    blob_set_size(&hdr, record->size);
    blob_set_self(&hdr, addr);

    /*
     * PARTIAL WRITE
     *
     * are there gaps at the beginning? If yes, then we'll fill with zeros
     */
    if ((flags&HAM_PARTIAL) && (record->partial_offset)) {
        ham_u8_t *ptr;
        ham_size_t gapsize=record->partial_offset;

        ptr=allocator_calloc(env_get_allocator(env), 
                                    gapsize > env_get_pagesize(env)
                                        ? env_get_pagesize(env)
                                        : gapsize);
        if (!ptr)
            return (HAM_OUT_OF_MEMORY);

        /* 
         * first: write the header
         */
        chunk_data[0]=(ham_u8_t *)&hdr;
        chunk_size[0]=sizeof(hdr);
        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 1);
        if (st)
            return (st);

        addr+=sizeof(hdr);

        /* now fill the gap; if the gap is bigger than a pagesize we'll
         * split the gap into smaller chunks 
         */
        while (gapsize>=env_get_pagesize(env)) {
            chunk_data[0]=ptr;
            chunk_size[0]=env_get_pagesize(env);
            st=__write_chunks(env, page, addr, HAM_TRUE, 
                    freshly_created, chunk_data, chunk_size, 1);
            if (st)
                break;
            gapsize-=env_get_pagesize(env);
            addr+=env_get_pagesize(env);
        }

        /* fill the remaining gap */
        if (gapsize) {
            chunk_data[0]=ptr;
            chunk_size[0]=gapsize;

            st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                            chunk_data, chunk_size, 1);
            if (st)
                return (st);
            addr+=gapsize;
        }

        allocator_free(env_get_allocator(env), ptr);

        /* now write the "real" data */
        chunk_data[0]=(ham_u8_t *)record->data;
        chunk_size[0]=record->partial_size;

        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 1);
        if (st)
            return (st);
        addr+=record->partial_size;
    }
    else {
        /* 
         * not writing partially: write header and data, then we're done
         */
        chunk_data[0]=(ham_u8_t *)&hdr;
        chunk_size[0]=sizeof(hdr);
        chunk_data[1]=(ham_u8_t *)record->data;
        chunk_size[1]=(flags&HAM_PARTIAL) 
                        ? record->partial_size 
                        : record->size;

        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 2);
        if (st)
            return (st);
        addr+=sizeof(hdr)+
            ((flags&HAM_PARTIAL) ? record->partial_size : record->size);
    }

    /*
     * store the blobid; it will be returned to the caller
     */
    *blobid=blob_get_self(&hdr);

    /*
     * PARTIAL WRITES:
     *
     * if we have gaps at the end of the blob: just append more chunks to
     * fill these gaps. Since they can be pretty large we split them into
     * smaller chunks if necessary.
     */
    if (flags&HAM_PARTIAL) {
        if (record->partial_offset+record->partial_size < record->size) {
            ham_u8_t *ptr;
            ham_size_t gapsize=record->size
                            - (record->partial_offset+record->partial_size);

            /* now fill the gap; if the gap is bigger than a pagesize we'll
             * split the gap into smaller chunks 
             *
             * we split this loop in two - the outer loop will allocate the
             * memory buffer, thus saving some allocations
             */
            while (gapsize>env_get_pagesize(env)) {
                ham_u8_t *ptr=allocator_calloc(env_get_allocator(env), 
                                            env_get_pagesize(env));
                if (!ptr)
                    return (HAM_OUT_OF_MEMORY);
                while (gapsize>env_get_pagesize(env)) {
                    chunk_data[0]=ptr;
                    chunk_size[0]=env_get_pagesize(env);
                    st=__write_chunks(env, page, addr, HAM_TRUE, 
                            freshly_created, chunk_data, chunk_size, 1);
                    if (st)
                        break;
                    gapsize-=env_get_pagesize(env);
                    addr+=env_get_pagesize(env);
                }
                allocator_free(env_get_allocator(env), ptr);
                if (st)
                    return (st);
            }
            
            /* now write the remainder, which is at most one pagesize */
            ham_assert(gapsize<=env_get_pagesize(env), (""));

            chunk_size[0]=gapsize;
            ptr=chunk_data[0]=allocator_calloc(env_get_allocator(env), gapsize);
            if (!ptr)
                return (HAM_OUT_OF_MEMORY);

            st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 1);
            allocator_free(env_get_allocator(env), ptr);
            if (st)
                return (st);
        }
    }

    return (0);
}
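For reference, the two bits of size arithmetic used above (the DB_CHUNKSIZE rounding of alloc_size, and the splitting of zero-filled gaps into page-sized writes in the PARTIAL WRITE paths) boil down to the following standalone sketch. The chunk size and page size below are assumptions chosen for illustration only; the real values come from DB_CHUNKSIZE and env_get_pagesize().

#include <stdio.h>
#include <stddef.h>

/* round 'size' up to the next multiple of 'chunk' (mirrors the alloc_size
 * computation in blob_allocate; all constants here are illustrative only) */
static size_t
round_up_to_chunk(size_t size, size_t chunk)
{
    size += chunk - 1;
    size -= size % chunk;
    return (size);
}

int
main(void)
{
    const size_t chunk = 32;        /* assumed chunk size, for illustration */
    const size_t pagesize = 4096;   /* assumed page size, for illustration */
    size_t gapsize = 10000;

    /* a 130-byte header+payload occupies 160 bytes of chunk-aligned storage */
    printf("alloc_size = %zu\n", round_up_to_chunk(130, chunk));

    /* a 10000-byte gap of zeroes is written as two full pages plus one
     * 1808-byte remainder - the same shape as the gap-filling loops above */
    while (gapsize >= pagesize) {
        printf("write %zu zero bytes\n", pagesize);
        gapsize -= pagesize;
    }
    if (gapsize)
        printf("write %zu zero bytes (remainder)\n", gapsize);
    return (0);
}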
Example #2
static ham_status_t
__insert_cursor(ham_btree_t *be, ham_key_t *key, ham_record_t *record, 
                ham_bt_cursor_t *cursor, insert_hints_t *hints)
{
    ham_status_t st;
    ham_page_t *root;
    ham_db_t *db=be_get_db(be);
    ham_env_t *env = db_get_env(db);
    insert_scratchpad_t scratchpad;

    ham_assert(hints->force_append == HAM_FALSE, (0));
    ham_assert(hints->force_prepend == HAM_FALSE, (0));

    /* 
     * initialize the scratchpad 
     */
    memset(&scratchpad, 0, sizeof(scratchpad));
    scratchpad.be=be;
    scratchpad.record=record;
    scratchpad.cursor=cursor;

    /* 
     * get the root-page...
     */
    ham_assert(btree_get_rootpage(be)!=0, ("btree has no root page"));
    st=db_fetch_page(&root, db, btree_get_rootpage(be), 0);
    ham_assert(st ? root == NULL : 1, (0));
    if (st)
        return st;

    /* 
     * ... and start the recursion 
     */
    st=__insert_recursive(root, key, 0, &scratchpad, hints);

    /*
     * if the root page was split, we have to create a new
     * root page.
     */
    if (st==SPLIT) {
        ham_page_t *newroot;
        btree_node_t *node;

        /*
         * the root-page will be changed...
         */
        st=ham_log_add_page_before(root);
        if (st)
            return (st);

        /*
         * allocate a new root page
         */
        st=db_alloc_page(&newroot, db, PAGE_TYPE_B_ROOT, 0); 
        ham_assert(st ? newroot == NULL : 1, (0));
        if (st)
            return (st);
        ham_assert(page_get_owner(newroot), (""));
        /* clear the node header */
        memset(page_get_payload(newroot), 0, sizeof(btree_node_t));

        stats_page_is_nuked(db, root, HAM_TRUE);

        /* 
         * insert the pivot element and the ptr_left
         */ 
        node=ham_page_get_btree_node(newroot);
        btree_node_set_ptr_left(node, btree_get_rootpage(be));
        st=__insert_nosplit(newroot, &scratchpad.key, 
                scratchpad.rid, scratchpad.record, scratchpad.cursor, 
                hints);
        ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
        scratchpad.cursor=0; /* don't overwrite cursor if __insert_nosplit
                                is called again */
        if (st) {
            ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
            if (scratchpad.key.data)
                allocator_free(env_get_allocator(env), scratchpad.key.data);
            return (st);
        }

        /*
         * set the new root page
         *
         * !!
         * do NOT delete the old root page - it's still in use!
         *
         * also don't forget to flush the backend - otherwise the header
         * page of the database will not contain the updated information.
         * The backend is flushed when the database is closed, but if 
         * recovery is enabled then the flush here is critical.
         */
        btree_set_rootpage(be, page_get_self(newroot));
        be_set_dirty(be, HAM_TRUE);
        be->_fun_flush(be);

        /*
         * since the old root page is re-purposed as a regular index page, we
         * reset its cache access counter as well, to signal its first use
         * under the new page type assigned below.
         */
        if (env_get_cache(env) && (page_get_type(root)!=PAGE_TYPE_B_INDEX))
            cache_update_page_access_counter(root, env_get_cache(env), 0);

        page_set_type(root, PAGE_TYPE_B_INDEX);
        page_set_dirty(root, env);
        page_set_dirty(newroot, env);

        /* the root page was modified (btree_set_rootpage) - make sure that
         * it's logged */
        if (env_get_rt_flags(env)&HAM_ENABLE_RECOVERY) {
            st=txn_add_page(env_get_txn(env), env_get_header_page(env),
                    HAM_TRUE);
            if (st)
                return (st);
        }
    }

    /*
     * release the scratchpad-memory and return to caller
     */
    ham_assert(!(scratchpad.key.flags & HAM_KEY_USER_ALLOC), (0));
    if (scratchpad.key.data)
        allocator_free(env_get_allocator(env), scratchpad.key.data);

    return (st);
}
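The SPLIT branch above is the standard B+-tree "grow at the root" step: the old root keeps the lower keys, the pivot key seeds a brand-new root, and the new root's ptr_left points back at the old root, so the tree grows by exactly one level at the top. A toy, self-contained sketch of that shape follows; every name in it (toy_node, toy_new_root, ...) is invented for illustration and is not a hamsterdb API.

#include <stdio.h>
#include <stdlib.h>

/* toy node layout: ptr_left holds keys smaller than keys[0], children[i]
 * holds keys >= keys[i] (simplified; real nodes store rids, not pointers) */
typedef struct toy_node {
    int keys[4];
    int nkeys;
    struct toy_node *ptr_left;
    struct toy_node *children[4];
} toy_node;

/* old_root was split into (old_root, sibling) around 'pivot': the new root
 * gets exactly one key and adopts both halves */
static toy_node *
toy_new_root(toy_node *old_root, toy_node *sibling, int pivot)
{
    toy_node *root = calloc(1, sizeof(*root));
    if (!root)
        return (NULL);
    root->ptr_left = old_root;
    root->keys[0] = pivot;
    root->children[0] = sibling;
    root->nkeys = 1;
    return (root);
}

int
main(void)
{
    toy_node left  = { {10, 20}, 2, NULL, {NULL} };
    toy_node right = { {30, 40}, 2, NULL, {NULL} };
    toy_node *root = toy_new_root(&left, &right, 30);

    printf("new root: pivot=%d, ptr_left starts at key %d\n",
            root->keys[0], root->ptr_left->keys[0]);
    free(root);
    return (0);
}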
Example #3
static ham_status_t
__insert_split(ham_page_t *page, ham_key_t *key, 
        ham_offset_t rid, insert_scratchpad_t *scratchpad, 
        insert_hints_t *hints)
{
    int cmp;
    ham_status_t st;
    ham_page_t *newpage, *oldsib;
    int_key_t *nbte, *obte;
    btree_node_t *nbtp, *obtp, *sbtp;
    ham_size_t count, keysize;
    ham_db_t *db=page_get_owner(page);
    ham_env_t *env = db_get_env(db);
    ham_key_t pivotkey, oldkey;
    ham_offset_t pivotrid;
    ham_u16_t pivot;
    ham_bool_t pivot_at_end=HAM_FALSE;

    ham_assert(page_get_owner(page), (0));
    ham_assert(device_get_env(page_get_device(page)) 
            == db_get_env(page_get_owner(page)), (0));

    ham_assert(hints->force_append == HAM_FALSE, (0));

    keysize=db_get_keysize(db);

    /*
     * allocate a new page
     */
    hints->cost++;
    st=db_alloc_page(&newpage, db, PAGE_TYPE_B_INDEX, 0); 
    ham_assert(st ? newpage == NULL : 1, (0));
    ham_assert(!st ? newpage != NULL : 1, (0));
    if (st)
        return st; 
    ham_assert(page_get_owner(newpage), (""));
    /* clear the node header */
    memset(page_get_payload(newpage), 0, sizeof(btree_node_t));

    stats_page_is_nuked(db, page, HAM_TRUE);

    /*
     * move half of the key/rid-tuples to the new page
     *
     * !! recno: keys are sorted; we do a "lazy split"
     */
    nbtp=ham_page_get_btree_node(newpage);
    nbte=btree_node_get_key(db, nbtp, 0);
    obtp=ham_page_get_btree_node(page);
    obte=btree_node_get_key(db, obtp, 0);
    count=btree_node_get_count(obtp);

    /*
     * for databases with sequential access (this includes recno databases):
     * do not split in the middle, but at the very end of the page
     *
     * if this page is the right-most page in the index, and this key is 
     * inserted at the very end, then we select the same pivot as for
     * sequential access
     */
    if (db_get_data_access_mode(db)&HAM_DAM_SEQUENTIAL_INSERT)
        pivot_at_end=HAM_TRUE;
    else if (btree_node_get_right(obtp)==0) {
        cmp=key_compare_pub_to_int(db, page, key, btree_node_get_count(obtp)-1);
        if (cmp>0)
            pivot_at_end=HAM_TRUE;
    }

    /*
     * internal pages set the count of the new page to count-pivot-1 (because
     * the pivot element will become ptr_left of the new page).
     * by using pivot=count-2 we make sure that at least 1 element will remain
     * in the new node.
     */
    if (pivot_at_end) {
        pivot=count-2;
    }
    else {
        pivot=count/2;
    }

    /*
     * uncouple all cursors
     */
    st=bt_uncouple_all_cursors(page, pivot);
    if (st)
        return (st);

    /*
     * if we split a leaf, we'll insert the pivot element in the leaf
     * page, too. in internal nodes, we don't insert it, but propagate
     * it to the parent node only.
     */
    if (btree_node_is_leaf(obtp)) {
        hints->cost += stats_memmove_cost(
                (db_get_int_key_header_size()+keysize)*(count-pivot));
        memcpy((char *)nbte,
               ((char *)obte)+(db_get_int_key_header_size()+keysize)*pivot, 
               (db_get_int_key_header_size()+keysize)*(count-pivot));
    }
    else {
        hints->cost += stats_memmove_cost(
                (db_get_int_key_header_size()+keysize)*(count-pivot-1));
        memcpy((char *)nbte,
               ((char *)obte)+(db_get_int_key_header_size()+keysize)*(pivot+1), 
               (db_get_int_key_header_size()+keysize)*(count-pivot-1));
    }
    
    /* 
     * store the pivot element, we'll need it later to propagate it 
     * to the parent page
     */
    nbte=btree_node_get_key(db, obtp, pivot);

    memset(&pivotkey, 0, sizeof(pivotkey));
    memset(&oldkey, 0, sizeof(oldkey));
    oldkey.data=key_get_key(nbte);
    oldkey.size=key_get_size(nbte);
    oldkey._flags=key_get_flags(nbte);
    st = util_copy_key(db, &oldkey, &pivotkey);
    if (st) 
    {
        (void)db_free_page(newpage, DB_MOVE_TO_FREELIST);
        goto fail_dramatically;
    }
    pivotrid=page_get_self(newpage);

    /*
     * adjust the page count
     */
    if (btree_node_is_leaf(obtp)) {
        btree_node_set_count(obtp, pivot);
        btree_node_set_count(nbtp, count-pivot);
    }
    else {
        btree_node_set_count(obtp, pivot);
        btree_node_set_count(nbtp, count-pivot-1);
    }

    /*
     * if we're in an internal page: fix the ptr_left of the new page
     * (it points to the ptr of the pivot key)
     */ 
    if (!btree_node_is_leaf(obtp)) {
        /* 
         * nbte still contains the pivot key 
         */
        btree_node_set_ptr_left(nbtp, key_get_ptr(nbte));
    }

    /*
     * insert the new element
     */
    hints->cost++;
    cmp=key_compare_pub_to_int(db, page, key, pivot);
    if (cmp < -1) 
    {
        st = (ham_status_t)cmp;
        goto fail_dramatically;
    }

    if (cmp>=0)
        st=__insert_nosplit(newpage, key, rid, 
                scratchpad->record, scratchpad->cursor, hints);
    else
        st=__insert_nosplit(page, key, rid, 
                scratchpad->record, scratchpad->cursor, hints);
    if (st) 
    {
        goto fail_dramatically;
    }
    scratchpad->cursor=0; /* don't overwrite cursor if __insert_nosplit
                             is called again */

    /*
     * fix the double-linked list of pages, and mark the pages as dirty
     */
    if (btree_node_get_right(obtp)) 
    {
        st=db_fetch_page(&oldsib, db, btree_node_get_right(obtp), 0);
        if (st)
            goto fail_dramatically;
    }
    else
    {
        oldsib=0;
    }

    if (oldsib) {
        st=ham_log_add_page_before(oldsib);
        if (st)
            goto fail_dramatically;
    }

    btree_node_set_left (nbtp, page_get_self(page));
    btree_node_set_right(nbtp, btree_node_get_right(obtp));
    btree_node_set_right(obtp, page_get_self(newpage));
    if (oldsib) {
        sbtp=ham_page_get_btree_node(oldsib);
        btree_node_set_left(sbtp, page_get_self(newpage));
        page_set_dirty(oldsib, env);
    }
    page_set_dirty(newpage, env);
    page_set_dirty(page, env);

    /* 
     * propagate the pivot key to the parent page
     */
    ham_assert(!(scratchpad->key.flags & HAM_KEY_USER_ALLOC), (0));
    if (scratchpad->key.data)
        allocator_free(env_get_allocator(env), scratchpad->key.data);
    scratchpad->key=pivotkey;
    scratchpad->rid=pivotrid;
    ham_assert(!(scratchpad->key.flags & HAM_KEY_USER_ALLOC), (0));

    return (SPLIT);

fail_dramatically:
    ham_assert(!(pivotkey.flags & HAM_KEY_USER_ALLOC), (0));
    if (pivotkey.data)
        allocator_free(env_get_allocator(env), pivotkey.data);
    return st;
}
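The pivot choice above is the heart of the "lazy split": for sequential or append-only workloads the page is split at the very end, so the left page stays almost full, while random inserts split in the middle. A minimal sketch of just that decision (choose_pivot is a made-up name, not a hamsterdb function):

#include <stdio.h>

/* illustrative only: mirrors the pivot selection in __insert_split */
static unsigned
choose_pivot(unsigned count, int pivot_at_end)
{
    /* split at the very end for sequential access, in the middle otherwise;
     * count-2 guarantees at least one element remains for the new node even
     * when the pivot becomes ptr_left of an internal page */
    return (pivot_at_end ? count - 2 : count / 2);
}

int
main(void)
{
    unsigned count = 16;

    printf("random insert:     pivot=%u (8 keys stay, 8 move)\n",
            choose_pivot(count, 0));
    printf("sequential insert: pivot=%u (14 keys stay, 2 move)\n",
            choose_pivot(count, 1));
    return (0);
}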
Example #4
/**                                                                 
 * create and initialize a new backend                              
 *                                                                  
 * @remark this function is called after the @a ham_db_t structure  
 * and the file were created                                        
 *                                                                  
 * the @a flags are stored in the database; only transfer           
 * the persistent flags!                                            
 *
 * @note This is a B+-tree 'backend' method.
 */                                                                 
static ham_status_t 
my_fun_create(ham_btree_t *be, ham_u16_t keysize, ham_u32_t flags)
{
    ham_status_t st;
    ham_page_t *root;
    ham_size_t maxkeys;
    ham_db_t *db=be_get_db(be);
    db_indexdata_t *indexdata=env_get_indexdata_ptr(db_get_env(db), 
                                db_get_indexdata_offset(db));
    if (be_is_active(be))
    {
        ham_trace(("backend has alread been initialized before!"));
        /* HAM_INTERNAL_ERROR -- not really, when keeping custom 
         * backends in mind */
        return HAM_ALREADY_INITIALIZED; 
    }

    /* 
     * prevent overflow - maxkeys only has 16 bit! 
     */
    maxkeys=btree_calc_maxkeys(env_get_pagesize(db_get_env(db)), keysize);
    if (maxkeys>MAX_KEYS_PER_NODE) {
        ham_trace(("keysize/pagesize ratio too high"));
        return HAM_INV_KEYSIZE;
    }
    else if (maxkeys==0) {
        ham_trace(("keysize too large for the current pagesize"));
        return HAM_INV_KEYSIZE;
    }

    /*
     * allocate a new root page
     */
    st=db_alloc_page(&root, db, PAGE_TYPE_B_ROOT, PAGE_IGNORE_FREELIST);
    ham_assert(st ? root == NULL : 1, (0));
    ham_assert(!st ? root != NULL : 1, (0));
    if (!root)
        return st ? st : HAM_INTERNAL_ERROR;

    memset(page_get_raw_payload(root), 0, 
            sizeof(btree_node_t)+sizeof(ham_perm_page_union_t));

    /*
     * calculate the maximum number of keys for this page, 
     * and make sure that this number is even
     */
    btree_set_maxkeys(be, (ham_u16_t)maxkeys);
    be_set_dirty(be, HAM_TRUE);
    be_set_keysize(be, keysize);
    be_set_flags(be, flags);

    btree_set_rootpage(be, page_get_self(root));

    index_clear_reserved(indexdata);
    index_set_max_keys(indexdata, (ham_u16_t)maxkeys);
    index_set_keysize(indexdata, keysize);
    index_set_self(indexdata, page_get_self(root));
    index_set_flags(indexdata, flags);
    index_set_recno(indexdata, 0);
    index_clear_reserved(indexdata);

    env_set_dirty(db_get_env(db));

    be_set_active(be, HAM_TRUE);

    return (0);
}
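The two HAM_INV_KEYSIZE checks above guard against opposite failure modes: an estimate that overflows the 16-bit maxkeys field, and a keysize so large that not even one key fits on a page. The sketch below gives a rough feel for that relationship; it is not the real btree_calc_maxkeys() (which also subtracts page and node header overhead and keeps the result even), and every constant in it is an assumption made up for illustration.

#include <stdio.h>

/* rough, illustrative estimate only - NOT the real btree_calc_maxkeys() */
#define TOY_MAX_KEYS_PER_NODE  0xFFFFUL   /* maxkeys is stored in 16 bits */
#define TOY_KEY_OVERHEAD       12UL       /* assumed per-key header size */

static unsigned long
toy_calc_maxkeys(unsigned long pagesize, unsigned long keysize)
{
    return (pagesize / (TOY_KEY_OVERHEAD + keysize));
}

int
main(void)
{
    /* tiny keys on a very large page: the estimate no longer fits in the
     * 16-bit maxkeys field -> "keysize/pagesize ratio too high" */
    unsigned long many = toy_calc_maxkeys(1024UL * 1024UL, 1UL);
    /* a key far larger than the page: nothing fits -> "keysize too large" */
    unsigned long none = toy_calc_maxkeys(4096UL, 100000UL);

    printf("%lu keys (> %lu: reject)\n", many, TOY_MAX_KEYS_PER_NODE);
    printf("%lu keys (== 0: reject)\n", none);
    return (0);
}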