Code example #1
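/*
 * mark a page as 'delete pending' instead of freeing it immediately; the
 * flag is acted upon (or cleared again) when the transaction ends - see
 * txn_abort() in code example #5, which removes the flag on rollback
 */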
ham_status_t
txn_free_page(ham_txn_t *txn, ham_page_t *page)
{
    ham_assert(!(page_get_npers_flags(page)&PAGE_NPERS_DELETE_PENDING), (0));
    ham_assert(page_get_cursors(page)==0, (0));

    page_set_npers_flags(page,
            page_get_npers_flags(page)|PAGE_NPERS_DELETE_PENDING);

    return (HAM_SUCCESS);
}
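A minimal standalone sketch of the flag round-trip between txn_free_page() above and txn_abort() in code example #5; demo_page_t and the 0x0008 bit value are hypothetical stand-ins, not hamsterdb's definitions:

#include <assert.h>

#define PAGE_NPERS_DELETE_PENDING  0x0008u  /* hypothetical bit value */

typedef struct { unsigned npers_flags; } demo_page_t;

int
main(void)
{
    demo_page_t page = { 0 };

    /* txn_free_page(): mark the page, don't free it yet */
    page.npers_flags |= PAGE_NPERS_DELETE_PENDING;
    assert(page.npers_flags & PAGE_NPERS_DELETE_PENDING);

    /* txn_abort(): undo the pending delete */
    page.npers_flags &= ~PAGE_NPERS_DELETE_PENDING;
    assert(!(page.npers_flags & PAGE_NPERS_DELETE_PENDING));
    return (0);
}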
Code example #2
File: blob.c Project: bawerd/hamsterdb
/**
 * write a series of data chunks to storage at file offset 'addr'.
 * 
 * The chunks are assumed to be stored in sequential order, adjacent
 * to each other, i.e. as one long data strip.
 * 
 * Writing is performed on a per-page basis; special conditions decide,
 * for each page, whether the write goes through the page cache or
 * directly to the device.
 */
static ham_status_t
__write_chunks(ham_env_t *env, ham_page_t *page, ham_offset_t addr, 
        ham_bool_t allocated, ham_bool_t freshly_created, 
        ham_u8_t **chunk_data, ham_size_t *chunk_size, 
        ham_size_t chunks)
{
    ham_size_t i;
    ham_status_t st;
    ham_offset_t pageid;
    ham_device_t *device=env_get_device(env);
    ham_size_t pagesize=env_get_pagesize(env);

    ham_assert(freshly_created ? allocated : 1, (0));

    /*
     * for each chunk...
     */
    for (i=0; i<chunks; i++) {
        while (chunk_size[i]) {
            /*
             * get the page-ID from this chunk
             */
            pageid = addr - (addr % pagesize);
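            /* e.g., with a pagesize of 0x4000 (16 KB), addr 0x6100
             * yields pageid 0x4000: 0x6100 - (0x6100 % 0x4000) */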

            /*
             * does the passed-in page match this chunk? if not, drop 
             * the pointer; we'll fetch the right page below
             */
            if (page && page_get_self(page)!=pageid)
                page=0;

            /*
             * fetch the page from the cache, if it's in the cache
             * (unless we're logging - in this case always go through
             * the buffered routines)
             */
            if (!page) {
                /*
                 * keep pages in cache when they are located at the 'edges' of 
                 * the blob, as they MAY be accessed for different data.
                 * Of course, when a blob is small, there's only one (partial) 
                 * page accessed anyhow, so that one should end up in cache 
                 * then.
                 *
                 * When transaction logging is turned on, it's the same story, 
                 * really. We _could_ keep all those pages in cache now, but 
                 * this would thrash the cache with blob data that's accessed 
                 * only once, and for transaction abort (or commit) the amount 
                 * of effort does not change.
                 *
                 * THOUGHT:
                 *
                 * Do we actually care what was in that page, which is going 
                 * to be overwritten in its entirety, BEFORE we do this, i.e. 
                 * before the transaction? 
                 *
                 * Answer: NO (and YES in special circumstances).
                 *
                 * Elaboration: As this would have been free space before, the 
                 * actual content does not matter, so it's not required to add
                 * the FULL pages written by the blob write action here to the 
                 * transaction log: even on transaction abort, that lingering 
                 * data is marked as 'bogus'/free as it was before anyhow.
                 *
                 * And then, assuming a longer-running transaction where this 
                 * page was freed during a previous action WITHIN
                 * the transaction: well, then the transaction log should 
                 * already carry this page's previous content, as instructed 
                 * by the erase operation. HOWEVER, the erase operation would 
                 * not have a particular NEED to edit this page, as an erase op 
                 * is complete by just marking this space as free in the 
                 * freelist, resulting in the freelist pages (and the btree 
                 * pages) being the only ones edited and ending up in 
                 * the transaction log then.
                 *
                 * Which means we'll have to log the previous content of these 
                 * pages to the transaction log anyhow. UNLESS, that is, when
                 * WE allocated these pages in the first place: then there 
                 * cannot be any 'pre-transaction' state of these pages 
                 * except that of 'not existing', i.e. 'free'. In which case, 
                 * their actual content doesn't matter! (freshly_created)
                 *
                 * And what if we have recovery logging turned on, but it's 
                 * not about an active transaction here?
                 * In that case, the recovery log would only log the OLD page 
                 * content, which we've concluded is never significant. Of 
                 * course, that's assuming (again!) that we're writing to 
                 * freshly created pages, which no-one has seen before. 
                 *
                 * Just as long as we can prevent this section from thrashing 
                 * the page cache, thank you very much...
                 */
                ham_bool_t at_blob_edge = (__blob_from_cache(env, chunk_size[i])
                        || (addr % pagesize) != 0 
                        || chunk_size[i] < pagesize);
                ham_bool_t cacheonly = (!at_blob_edge 
                                    && (!env_get_log(env)
                                        || freshly_created));
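                /* in short: at_blob_edge holds for (roughly) small blobs,
                 * unaligned start addresses and partial trailing pages;
                 * cacheonly holds when we're NOT at a blob edge and either
                 * no log is active or we created these pages ourselves */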
                //ham_assert(db_get_txn(db) ? !!env_get_log(env) : 1, (0));

                st=env_fetch_page(&page, env, pageid, 
                        cacheonly ? DB_ONLY_FROM_CACHE : 
                        at_blob_edge ? 0 : DB_NEW_PAGE_DOES_THRASH_CACHE);
                ham_assert(st ? !page : 1, (0));
                /* blob pages don't have a page header */
                if (page)
                {
                    page_set_npers_flags(page, 
                        page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
                    /* if this page was recently allocated by the parent
                     * function: set a flag */
                    if (cacheonly 
                            && allocated 
                            && addr==page_get_self(page) 
                            && env_get_txn(env))
                        page_set_alloc_txn_id(page, txn_get_id(env_get_txn(env)));
                }
                else if (st) {
                    return st;
                }
            }

            /*
             * if we have a page pointer: use it; otherwise write directly
             * to the device
             */
            if (page) {
                ham_size_t writestart=
                        (ham_size_t)(addr-page_get_self(page));
                ham_size_t writesize =
                        (ham_size_t)(pagesize - writestart);
                if (writesize>chunk_size[i])
                    writesize=chunk_size[i];
                if ((st=ham_log_add_page_before(page)))
                    return (st);
                memcpy(&page_get_raw_payload(page)[writestart], chunk_data[i],
                            writesize);
                page_set_dirty(page, env);
                addr+=writesize;
                chunk_data[i]+=writesize;
                chunk_size[i]-=writesize;
            }
            else {
                ham_size_t s = chunk_size[i];
                /* limit to the next page boundary */
                if (s > pageid+pagesize-addr)
                    s = (ham_size_t)(pageid+pagesize-addr);

                ham_assert(env_get_log(env) ? freshly_created : 1, (0));

                st=device->write(device, addr, chunk_data[i], s);
                if (st)
                    return st;
                addr+=s;
                chunk_data[i]+=s;
                chunk_size[i]-=s;
            }
        }
    }

    return (0);
}
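The per-page splitting that drives both branches above boils down to a few lines of address arithmetic. A standalone sketch (illustrative names only, not hamsterdb API):

#include <stdio.h>

static void
split_at_page_boundaries(unsigned long addr, unsigned long size,
        unsigned long pagesize)
{
    while (size) {
        unsigned long pageid = addr - (addr % pagesize);
        unsigned long piece = pageid + pagesize - addr; /* to the boundary */
        if (piece > size)
            piece = size;
        printf("page 0x%lx: write %lu bytes at page offset %lu\n",
                pageid, piece, addr - pageid);
        addr += piece;
        size -= piece;
    }
}

int
main(void)
{
    /* e.g. a 10000-byte chunk starting 100 bytes into a 4096-byte page */
    split_at_page_boundaries(4096 + 100, 10000, 4096);
    return (0);
}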
Code example #3
File: blob.c Project: bawerd/hamsterdb
/**
 * Allocate space in storage for the content referenced by 'data' (with 
 * length 'size') and write it there.
 * 
 * Special conditions decide whether the data is written through the page 
 * cache or directly to the device.
 * 
 * The content is, of course, prefixed by a BLOB header.
 * 
 * Partial writes are handled in this function.
 */
ham_status_t
blob_allocate(ham_env_t *env, ham_db_t *db, ham_record_t *record,
        ham_u32_t flags, ham_offset_t *blobid)
{
    ham_status_t st;
    ham_page_t *page=0;
    ham_offset_t addr;
    blob_t hdr;
    ham_u8_t *chunk_data[2];
    ham_size_t alloc_size;
    ham_size_t chunk_size[2];
    ham_device_t *device=env_get_device(env);
    ham_bool_t freshly_created = HAM_FALSE;
   
    *blobid=0;

    /*
     * PARTIAL WRITE
     * 
     * if the offset is 0 and offset+partial_size equals the full record 
     * size, then we won't have any gaps. In this case we just write the 
     * full record and ignore the partial parameters.
     */
    if (flags&HAM_PARTIAL) {
        if (record->partial_offset==0 
                && record->partial_offset+record->partial_size==record->size)
            flags&=~HAM_PARTIAL;
    }

    /*
     * in-memory-database: the blobid is actually a pointer to the memory
     * buffer, in which the blob (with the blob-header) is stored
     */
    if (env_get_rt_flags(env)&HAM_IN_MEMORY_DB) {
        blob_t *hdr;
        ham_u8_t *p=(ham_u8_t *)allocator_alloc(env_get_allocator(env), 
                                    record->size+sizeof(blob_t));
        if (!p) {
            return HAM_OUT_OF_MEMORY;
        }

        /* initialize the header */
        hdr=(blob_t *)p;
        memset(hdr, 0, sizeof(*hdr));
        blob_set_self(hdr, (ham_offset_t)PTR_TO_U64(p));
        blob_set_alloc_size(hdr, record->size+sizeof(blob_t));
        blob_set_size(hdr, record->size);

        /* do we have gaps? if yes, fill them with zeroes */
        if (flags&HAM_PARTIAL) {
            ham_u8_t *s=p+sizeof(blob_t);
            if (record->partial_offset)
                memset(s, 0, record->partial_offset);
            memcpy(s+record->partial_offset,
                    record->data, record->partial_size);
            if (record->partial_offset+record->partial_size<record->size)
                memset(s+record->partial_offset+record->partial_size, 0, 
                    record->size-(record->partial_offset+record->partial_size));
        }
        else {
            memcpy(p+sizeof(blob_t), record->data, record->size);
        }

        *blobid=(ham_offset_t)PTR_TO_U64(p);
        return (0);
    }

    memset(&hdr, 0, sizeof(hdr));

    /*
     * blobs are CHUNKSIZE-allocated 
     */
    alloc_size=sizeof(blob_t)+record->size;
    alloc_size += DB_CHUNKSIZE - 1;
    alloc_size -= alloc_size % DB_CHUNKSIZE;
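    /* e.g., if DB_CHUNKSIZE were 32, a 41-byte header+record total would
     * be rounded up to an alloc_size of 64 */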

    /* 
     * check if we have space in the freelist 
     */
    st = freel_alloc_area(&addr, env, db, alloc_size);
    if (!addr) 
    {
        if (st)
            return st;

        /*
         * if the blob is small AND if logging is disabled: load the page 
         * through the cache
         */
        if (__blob_from_cache(env, alloc_size)) {
            st = db_alloc_page(&page, db, PAGE_TYPE_BLOB, 
                        PAGE_IGNORE_FREELIST);
            ham_assert(st ? page == NULL : 1, (0));
            ham_assert(!st ? page != NULL : 1, (0));
            if (st)
                return st;
            /* blob pages don't have a page header */
            page_set_npers_flags(page, 
                    page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
            addr=page_get_self(page);
            /* move the remaining space to the freelist */
            (void)freel_mark_free(env, db, addr+alloc_size,
                    env_get_pagesize(env)-alloc_size, HAM_FALSE);
            blob_set_alloc_size(&hdr, alloc_size);
        }
        else {
            /*
             * otherwise use direct IO to allocate the space
             */
            ham_size_t aligned=alloc_size;
            aligned += env_get_pagesize(env) - 1;
            aligned -= aligned % env_get_pagesize(env);

            st=device->alloc(device, aligned, &addr);
            if (st) 
                return (st);

            /* if aligned!=size, and the remaining chunk is large enough:
             * move it to the freelist */
            {
                ham_size_t diff=aligned-alloc_size;
                if (diff > SMALLEST_CHUNK_SIZE) {
                    (void)freel_mark_free(env, db, addr+alloc_size, 
                            diff, HAM_FALSE);
                    blob_set_alloc_size(&hdr, aligned-diff);
                }
                else {
                    blob_set_alloc_size(&hdr, aligned);
                }
            }
            freshly_created = HAM_TRUE;
        }

        ham_assert(HAM_SUCCESS == freel_check_area_is_allocated(env, db,
                    addr, alloc_size), (0));
    }
    else {
        ham_assert(!st, (0));
        blob_set_alloc_size(&hdr, alloc_size);
    }

    blob_set_size(&hdr, record->size);
    blob_set_self(&hdr, addr);

    /*
     * PARTIAL WRITE
     *
     * are there gaps at the beginning? If yes, then we'll fill with zeros
     */
    if ((flags&HAM_PARTIAL) && (record->partial_offset)) {
        ham_u8_t *ptr;
        ham_size_t gapsize=record->partial_offset;

        ptr=allocator_calloc(env_get_allocator(env), 
                                    gapsize > env_get_pagesize(env)
                                        ? env_get_pagesize(env)
                                        : gapsize);
        if (!ptr)
            return (HAM_OUT_OF_MEMORY);

        /* 
         * first: write the header
         */
        chunk_data[0]=(ham_u8_t *)&hdr;
        chunk_size[0]=sizeof(hdr);
        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 1);
        if (st)
            return (st);

        addr+=sizeof(hdr);

        /* now fill the gap; if the gap is bigger than a pagesize we'll
         * split the gap into smaller chunks 
         */
        while (gapsize>=env_get_pagesize(env)) {
            chunk_data[0]=ptr;
            chunk_size[0]=env_get_pagesize(env);
            st=__write_chunks(env, page, addr, HAM_TRUE, 
                    freshly_created, chunk_data, chunk_size, 1);
            if (st)
                break;
            gapsize-=env_get_pagesize(env);
            addr+=env_get_pagesize(env);
        }

        /* fill the remaining gap */
        if (gapsize) {
            chunk_data[0]=ptr;
            chunk_size[0]=gapsize;

            st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                            chunk_data, chunk_size, 1);
            if (st)
                return (st);
            addr+=gapsize;
        }

        allocator_free(env_get_allocator(env), ptr);

        /* now write the "real" data */
        chunk_data[0]=(ham_u8_t *)record->data;
        chunk_size[0]=record->partial_size;

        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 1);
        if (st)
            return (st);
        addr+=record->partial_size;
    }
    else {
        /* 
         * not writing partially: write header and data, then we're done
         */
        chunk_data[0]=(ham_u8_t *)&hdr;
        chunk_size[0]=sizeof(hdr);
        chunk_data[1]=(ham_u8_t *)record->data;
        chunk_size[1]=(flags&HAM_PARTIAL) 
                        ? record->partial_size 
                        : record->size;

        st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 2);
        if (st)
            return (st);
        addr+=sizeof(hdr)+
            ((flags&HAM_PARTIAL) ? record->partial_size : record->size);
    }

    /*
     * store the blobid; it will be returned to the caller
     */
    *blobid=blob_get_self(&hdr);

    /*
     * PARTIAL WRITES:
     *
     * if we have gaps at the end of the blob: just append more chunks to
     * fill these gaps. Since they can be pretty large we split them into
     * smaller chunks if necessary.
     */
    if (flags&HAM_PARTIAL) {
        if (record->partial_offset+record->partial_size < record->size) {
            ham_u8_t *ptr;
            ham_size_t gapsize=record->size
                            - (record->partial_offset+record->partial_size);

            /* now fill the gap; if the gap is bigger than a pagesize we'll
             * split the gap into smaller chunks 
             *
             * we split this loop in two - the outer loop will allocate the
             * memory buffer, thus saving some allocations
             */
            while (gapsize>env_get_pagesize(env)) {
                ham_u8_t *ptr=allocator_calloc(env_get_allocator(env), 
                                            env_get_pagesize(env));
                if (!ptr)
                    return (HAM_OUT_OF_MEMORY);
                while (gapsize>env_get_pagesize(env)) {
                    chunk_data[0]=ptr;
                    chunk_size[0]=env_get_pagesize(env);
                    st=__write_chunks(env, page, addr, HAM_TRUE, 
                            freshly_created, chunk_data, chunk_size, 1);
                    if (st)
                        break;
                    gapsize-=env_get_pagesize(env);
                    addr+=env_get_pagesize(env);
                }
                allocator_free(env_get_allocator(env), ptr);
                if (st)
                    return (st);
            }
            
            /* now write the remainder, which is at most a pagesize (the 
             * loops above use '>', so gapsize may still equal a full page) */
            ham_assert(gapsize<=env_get_pagesize(env), (""));

            chunk_size[0]=gapsize;
            ptr=chunk_data[0]=allocator_calloc(env_get_allocator(env), gapsize);
            if (!ptr)
                return (HAM_OUT_OF_MEMORY);

            st=__write_chunks(env, page, addr, HAM_TRUE, freshly_created, 
                        chunk_data, chunk_size, 1);
            allocator_free(env_get_allocator(env), ptr);
            if (st)
                return (st);
        }
    }

    return (0);
}
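For the HAM_PARTIAL path, the resulting on-disk image is: header, zero-filled leading gap, payload, zero-filled trailing gap. A standalone sketch that assembles the same layout in memory; the 28-byte header size is a made-up stand-in for sizeof(blob_t):

#include <string.h>
#include <assert.h>

#define DEMO_HDR_SIZE   28   /* hypothetical sizeof(blob_t) */

static void
demo_partial_layout(unsigned char *image, const unsigned char *hdr,
        const unsigned char *data, unsigned partial_offset,
        unsigned partial_size, unsigned record_size)
{
    memcpy(image, hdr, DEMO_HDR_SIZE);                      /* header  */
    memset(image + DEMO_HDR_SIZE, 0, partial_offset);       /* gap     */
    memcpy(image + DEMO_HDR_SIZE + partial_offset,
            data, partial_size);                            /* payload */
    memset(image + DEMO_HDR_SIZE + partial_offset + partial_size, 0,
            record_size - (partial_offset + partial_size)); /* gap     */
}

int
main(void)
{
    unsigned char hdr[DEMO_HDR_SIZE] = { 0 };
    unsigned char image[DEMO_HDR_SIZE + 100];

    /* 3 payload bytes at offset 10 of a 100-byte record */
    demo_partial_layout(image, hdr, (const unsigned char *)"abc",
            10, 3, 100);
    assert(image[DEMO_HDR_SIZE + 10] == 'a');   /* payload landed      */
    assert(image[DEMO_HDR_SIZE + 5] == 0);      /* leading gap zeroed  */
    return (0);
}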
Code example #4
File: blob.c Project: bawerd/hamsterdb
static ham_status_t
__read_chunk(ham_env_t *env, ham_page_t *page, ham_page_t **fpage, 
        ham_offset_t addr, ham_u8_t *data, ham_size_t size)
{
    ham_status_t st;
    ham_device_t *device=env_get_device(env);

    while (size) {
        /*
         * get the page-ID from this chunk
         */
        ham_offset_t pageid = addr - (addr % env_get_pagesize(env));

        if (page) {
            if (page_get_self(page)!=pageid)
                page=0;
        }

        /*
         * if it's not the current page, try to fetch it from the cache - 
         * but only read the page from disk if the chunk is small
         */
        if (!page) {
            st=env_fetch_page(&page, env, pageid, 
                    __blob_from_cache(env, size) ? 0 : DB_ONLY_FROM_CACHE);
            ham_assert(st ? !page : 1, (0));
            /* blob pages don't have a page header */
            if (page)
                page_set_npers_flags(page, 
                    page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
            else if (st)
                return st;
        }

        /*
         * if we have a page pointer: use it; otherwise read directly
         * from the device
         */
        if (page) {
            ham_size_t readstart=
                    (ham_size_t)(addr-page_get_self(page));
            ham_size_t readsize =
                    (ham_size_t)(env_get_pagesize(env)-readstart);
            if (readsize>size)
                readsize=size;
            memcpy(data, &page_get_raw_payload(page)[readstart], readsize);
            addr+=readsize;
            data+=readsize;
            size-=readsize;
        }
        else {
            ham_size_t s=(size<env_get_pagesize(env) 
                    ? size : env_get_pagesize(env));
            /* limit to the next page boundary */
            if (s>pageid+env_get_pagesize(env)-addr)
                s=(ham_size_t)(pageid+env_get_pagesize(env)-addr);

            st=device->read(device, addr, data, s);
            if (st) 
                return st;
            addr+=s;
            data+=s;
            size-=s;
        }
    }

    if (fpage)
        *fpage=page;

    return (0);
}
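The fpage out-parameter exists so a caller can chain reads without refetching the page. A hypothetical (non-standalone) fragment of that pattern; it assumes hamsterdb's internal types and a blob_get_size() accessor matching blob_set_size() in code example #3:

    blob_t hdr;
    ham_page_t *page=0;
    ham_status_t st;

    /* read the header first; remember which page the read ended on */
    st=__read_chunk(env, 0, &page, blobid, (ham_u8_t *)&hdr, sizeof(hdr));
    if (st)
        return st;

    /* the payload starts right after the header; passing 'page' back in
     * avoids a second cache lookup when both live on the same page */
    st=__read_chunk(env, page, 0, blobid+sizeof(hdr),
            data, (ham_size_t)blob_get_size(&hdr));
    if (st)
        return st;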
Code example #5
ham_status_t
txn_abort(ham_txn_t *txn, ham_u32_t flags)
{
    ham_status_t st;
    ham_env_t *env=txn_get_env(txn);

    /*
     * are cursors attached to this txn? if yes, fail
     */
    if (txn_get_cursor_refcount(txn)) {
        ham_trace(("transaction cannot be aborted until all attached "
                    "cursors are closed"));
        return HAM_CURSOR_STILL_OPEN;
    }

    if (env_get_log(env) && !(txn_get_flags(txn)&HAM_TXN_READ_ONLY)) {
        st=ham_log_append_txn_abort(env_get_log(env), txn);
        if (st) 
            return st;
    }

    env_set_txn(env, 0);

    /*
     * undo all operations from this transaction
     * 
     * this includes allocated pages (they're moved to the freelist), 
     * deleted pages (they're un-deleted) and other modifications (will
     * re-create the original page from the logfile)
     *
     * re-read txn_get_pagelist(txn) on every round (removing the head 
     * updates the list), so no local var for this one.
     */
    while (txn_get_pagelist(txn)) {
        ham_page_t *head = txn_get_pagelist(txn);

        if (!(flags & DO_NOT_NUKE_PAGE_STATS)) {
            /* 
             * nuke critical statistics, such as tracked outer bounds; 
             * imagine, for example, a failing erase transaction which, 
             * through erasing the top-most key, lowers the actual upper 
             * bound, after which the transaction fails at some later 
             * point. If we didn't 'rewind' our bounds statistics, a 
             * subsequent out-of-bounds insert (~ append) could FAIL 
             * because the hinter would be using incorrect bounds 
             * information!
             *
             * Hence we 'reverse' our statistics here, and the easiest 
             * route is to just nuke the critical bits; subsequent 
             * find/insert/erase operations will ensure that the stats 
             * get updated again anyhow. All we lose is a few subsequent 
             * operations, which might have been hinted if we had played 
             * a smarter game of statistics 'reversal'. So be it.
             */
            ham_db_t *db = page_get_owner(head);

            /*
             * we only need to do this for index pages anyhow, and those 
             * are the ones which have their 'ownership' set.
             */
            if (db) {
                stats_page_is_nuked(db, head, HAM_FALSE); 
            }
        }

        ham_assert(page_is_in_list(txn_get_pagelist(txn), head, PAGE_LIST_TXN),
                             (0));
        txn_get_pagelist(txn) = page_list_remove(head, PAGE_LIST_TXN, head);

        /* if this page was allocated by this transaction, then we can
         * move the whole page to the freelist */
        if (page_get_alloc_txn_id(head)==txn_get_id(txn)) {
            (void)freel_mark_free(env, 0, page_get_self(head), 
                    env_get_pagesize(env), HAM_TRUE);
        }
        else {
            /* remove the 'delete pending' flag */
            page_set_npers_flags(head, 
                    page_get_npers_flags(head)&~PAGE_NPERS_DELETE_PENDING);

            /* if the page is dirty, and RECOVERY is enabled: recreate
             * the original, unmodified page from the log */
            if (env_get_log(env) && page_is_dirty(head)) {
                st=ham_log_recreate(env_get_log(env), head);
                if (st)
                    return (st);
                /*page_set_undirty(head); */
            }
        }

        /* page is no longer in use */
        page_release_ref(head);
    }

    ham_assert(txn_get_pagelist(txn)==0, (0));

    return (0);
}
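The shape of the undo loop - re-reading the list head every round because removing the head mutates the list - can be reproduced with plain singly-linked nodes. A standalone sketch with demo types, not hamsterdb's page lists:

#include <stdio.h>
#include <stdlib.h>

typedef struct node { struct node *next; int id; } node_t;

int
main(void)
{
    /* build a three-element list: 2 -> 1 -> 0 */
    node_t *list = 0;
    int i;
    for (i = 0; i < 3; i++) {
        node_t *n = malloc(sizeof(*n));
        n->id = i;
        n->next = list;
        list = n;
    }

    /* unwind: always take the current head, never cache it */
    while (list) {
        node_t *head = list;
        list = head->next;          /* 'remove' the head from the list */
        printf("undoing page %d\n", head->id);
        free(head);                 /* stands in for page_release_ref() */
    }
    return (0);
}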