/*
BIG FAT WARNING:

This routine should NEVER be used like this:

  ham_txn_t txn;
  txn_begin(&txn, env, 0);
  ...
  txn_commit/abort(&txn);

in any (C/C++) environment where the code in the '...' may trigger out-of-band jumps, such as a
longjmp() to an outer layer or a C++ exception: the transaction 'txn' is bound to the 'db' structure
internally, so by the time the outer-layer exception handler runs, the stack space holding 'txn' will
have been NUKED, and closing (and cleaning up) the 'db' structure will then cause a CORE DUMP.

This shortcut style of coding was used throughout the unittests, where it was simply waiting for the
axe to fall...

It is also used within the hamsterdb C code itself, which is perfectly fine as this library does not
call any exception-throwing code... UNLESS OF COURSE such code is to be found in ANY of the
registered hooks/callbacks!

Hence any callbacks registered with hamsterDB should NEVER allow a C longjmp() or a C++ exception
to pass /through/ the hamsterdb layer itself, or a core dump at ham_close/ham_env_close invocation
will be the result.
*/
ham_status_t
txn_begin(ham_txn_t *txn, ham_env_t *env, ham_u32_t flags)
{
    ham_status_t st=0;

    /* for hamsterdb 1.0.4 - only support one transaction */
    if (env_get_txn(env)) {
        ham_trace(("only one concurrent transaction is supported"));
        return (HAM_LIMITS_REACHED);
    }

    memset(txn, 0, sizeof(*txn));
    txn_set_env(txn, env);
    txn_set_id(txn, env_get_txn_id(env)+1);
    txn_set_flags(txn, flags);
    env_set_txn(env, txn);
    env_set_txn_id(env, txn_get_id(txn));

    if (env_get_log(env) && !(flags&HAM_TXN_READ_ONLY))
        st=ham_log_append_txn_begin(env_get_log(env), txn);

    return st;
}
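/*
 * Illustrative sketch only (not part of the original source, hence guarded
 * by #if 0): txn_begin() above refuses a second live transaction per
 * environment and returns HAM_LIMITS_REACHED; once the first transaction is
 * committed or aborted, the environment's transaction slot is cleared and a
 * new transaction can be started. The stack-allocated pattern is safe here
 * because plain C code with no out-of-band jumps is involved (see the
 * warning above).
 */
#if 0
static ham_status_t
example_single_txn_limit(ham_env_t *env)
{
    ham_txn_t t1, t2;
    ham_status_t st;

    st=txn_begin(&t1, env, 0);          /* ok, env now owns t1 */
    if (st)
        return (st);
    st=txn_begin(&t2, env, 0);          /* refused while t1 is live */
    ham_assert(st==HAM_LIMITS_REACHED, (0));
    st=txn_commit(&t1, 0);              /* clears env_get_txn(env) */
    if (st)
        return (st);
    return (txn_begin(&t2, env, 0));    /* now succeeds with a new txn id */
}
#endif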
/**
 * write a series of data chunks to storage at file offset 'addr'.
 * 
 * The chunks are assumed to be stored in sequential order, adjacent
 * to each other, i.e. as one long data strip.
 * 
 * Writing is performed on a per-page basis, where special conditions
 * will decide whether or not the write operation is performed
 * through the page cache or directly to device; such is determined 
 * on a per-page basis.
 */
static ham_status_t
__write_chunks(ham_env_t *env, ham_page_t *page, ham_offset_t addr, 
        ham_bool_t allocated, ham_bool_t freshly_created, 
        ham_u8_t **chunk_data, ham_size_t *chunk_size, 
        ham_size_t chunks)
{
    ham_size_t i;
    ham_status_t st;
    ham_offset_t pageid;
    ham_device_t *device=env_get_device(env);
    ham_size_t pagesize=env_get_pagesize(env);

    ham_assert(freshly_created ? allocated : 1, (0));

    /*
     * for each chunk...
     */
    for (i=0; i<chunks; i++) {
        while (chunk_size[i]) {
            /*
             * get the page-ID from this chunk
             */
            pageid = addr - (addr % pagesize);

            /*
             * is this the current page?
             */
            if (page && page_get_self(page)!=pageid)
                page=0;

            /*
             * fetch the page from the cache, if it's in the cache
             * (unless we're logging - in this case always go through
             * the buffered routines)
             */
            if (!page) {
                /*
                 * keep pages in cache when they are located at the 'edges' of 
                 * the blob, as they MAY be accessed for different data.
                 * Of course, when a blob is small, there's only one (partial) 
                 * page accessed anyhow, so that one should end up in cache 
                 * then.
                 *
                 * When transaction logging is turned on, it's the same story, 
                 * really. We _could_ keep all those pages in cache now,
                 * but this would thrash the cache with blob data that's 
                 * accessed only once, and for transaction abort (or commit)
                 * the amount of effort does not change.
                 *
                 * THOUGHT:
                 *
                 * Do we actually care what was in that page, which is going 
                 * to be overwritten in its entirety, BEFORE we do this, i.e. 
                 * before the transaction? 
                 *
                 * Answer: NO (and YES in special circumstances).
                 *
                 * Elaboration: As this would have been free space before, the 
                 * actual content does not matter, so it's not required to add
                 * the FULL pages written by the blob write action here to the 
                 * transaction log: even on transaction abort, that lingering 
                 * data is marked as 'bogus'/free as it was before anyhow.
                 *
                 * And then, assuming a longer-running transaction where this 
                 * page was freed during a previous action WITHIN
                 * the transaction: well, then the transaction log should 
                 * already carry this page's previous content as instructed 
                 * by the erase operation. HOWEVER, the erase operation has 
                 * no particular NEED to edit this page, as an erase op 
                 * is complete by just marking this space as free in the 
                 * freelist, so the freelist pages (and the btree 
                 * pages) are the only ones being edited and ending up in 
                 * the transaction log.
                 *
                 * Which means we'll have to log the previous content of these 
                 * pages to the transaction log anyhow. UNLESS, that is, WE
                 * allocated these pages in the first place: then there 
                 * cannot be any 'pre-transaction' state of these pages 
                 * except that of 'not existing', i.e. 'free'. In which case, 
                 * their actual content doesn't matter! (freshly_created)
                 *
                 * And what if we have recovery logging turned on, but there 
                 * is no active transaction here?
                 * In that case, the recovery log would only log the OLD page 
                 * content, which we've just concluded never matters. Of 
                 * course, that's assuming (again!) that we're writing to 
                 * freshly created pages, which no one has seen before. 
                 *
                 * Just as long as we can prevent this section from thrashing 
                 * the page cache, thank you very much...
                 */
                ham_bool_t at_blob_edge = (__blob_from_cache(env, chunk_size[i])
                        || (addr % pagesize) != 0 
                        || chunk_size[i] < pagesize);
                ham_bool_t cacheonly = (!at_blob_edge 
                                    && (!env_get_log(env)
                                        || freshly_created));
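                /*
                 * Summary of the decision above (comment added for clarity,
                 * not in the original source): partial pages at the edges of
                 * the strip, and blobs small enough to go through the cache
                 * anyhow, are fetched normally and kept in the cache; full
                 * interior pages are taken from the cache only (a miss falls
                 * through to the direct device write below) when there is no
                 * log or the pages are freshly created, and are otherwise
                 * fetched with the thrash-cache hint.
                 */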
                //ham_assert(db_get_txn(db) ? !!env_get_log(env) : 1, (0));

                st=env_fetch_page(&page, env, pageid, 
                        cacheonly ? DB_ONLY_FROM_CACHE : 
                        at_blob_edge ? 0 : DB_NEW_PAGE_DOES_THRASH_CACHE);
                ham_assert(st ? !page : 1, (0));
                /* blob pages don't have a page header */
                if (page)
                {
                    page_set_npers_flags(page, 
                        page_get_npers_flags(page)|PAGE_NPERS_NO_HEADER);
                    /* if this page was recently allocated by the parent
                     * function: set a flag */
                    if (cacheonly 
                            && allocated 
                            && addr==page_get_self(page) 
                            && env_get_txn(env))
                        page_set_alloc_txn_id(page, txn_get_id(env_get_txn(env)));
                }
                else if (st) {
                    return st;
                }
            }

            /*
             * if we have a page pointer: use it; otherwise write directly
             * to the device
             */
            if (page) {
                ham_size_t writestart=
                        (ham_size_t)(addr-page_get_self(page));
                ham_size_t writesize =
                        (ham_size_t)(pagesize - writestart);
                if (writesize>chunk_size[i])
                    writesize=chunk_size[i];
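                /*
                 * Worked example (comment added for clarity, not in the
                 * original source): with a 16KB (0x4000) pagesize, a page at
                 * offset 0x8000 and addr==0x9000, writestart is 0x1000 and
                 * writesize is 0x3000, i.e. the rest of this page, unless
                 * the remaining chunk is smaller.
                 */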
                if ((st=ham_log_add_page_before(page)))
                    return (st);
                memcpy(&page_get_raw_payload(page)[writestart], chunk_data[i],
                            writesize);
                page_set_dirty(page, env);
                addr+=writesize;
                chunk_data[i]+=writesize;
                chunk_size[i]-=writesize;
            }
            else {
                ham_size_t s = chunk_size[i];
                /* limit to the next page boundary */
                if (s > pageid+pagesize-addr)
                    s = (ham_size_t)(pageid+pagesize-addr);

                ham_assert(env_get_log(env) ? freshly_created : 1, (0));

                st=device->write(device, addr, chunk_data[i], s);
                if (st)
                    return st;
                addr+=s;
                chunk_data[i]+=s;
                chunk_size[i]-=s;
            }
        }
    }

    return (0);
}
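/*
 * Illustrative sketch only (hypothetical caller, not part of the original
 * source, hence guarded by #if 0): a blob-style write hands __write_chunks()
 * two adjacent chunks, a fixed-size header followed by the payload, which
 * are then written as one contiguous strip starting at 'addr'.
 */
#if 0
static ham_status_t
example_write_header_and_payload(ham_env_t *env, ham_offset_t addr,
        ham_u8_t *header, ham_size_t header_size,
        ham_u8_t *payload, ham_size_t payload_size)
{
    ham_u8_t *chunk_data[2];
    ham_size_t chunk_size[2];

    chunk_data[0]=header;  chunk_size[0]=header_size;
    chunk_data[1]=payload; chunk_size[1]=payload_size;

    /* no page hint; assume the target space was freshly allocated by this
     * (hypothetical) caller, which also satisfies the logging asserts in
     * __write_chunks() */
    return (__write_chunks(env, 0, addr, HAM_TRUE, HAM_TRUE,
                chunk_data, chunk_size, 2));
}
#endif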
ham_status_t
txn_abort(ham_txn_t *txn, ham_u32_t flags)
{
    ham_status_t st;
    ham_env_t *env=txn_get_env(txn);

    /*
     * are cursors attached to this txn? if yes, fail
     */
    if (txn_get_cursor_refcount(txn)) {
        ham_trace(("transaction cannot be aborted till all attached "
                    "cursors are closed"));
        return HAM_CURSOR_STILL_OPEN;
    }

    if (env_get_log(env) && !(txn_get_flags(txn)&HAM_TXN_READ_ONLY)) {
        st=ham_log_append_txn_abort(env_get_log(env), txn);
        if (st) 
            return st;
    }

    env_set_txn(env, 0);

    /*
     * undo all operations from this transaction
     * 
     * this includes allocated pages (they're moved to the freelist), 
     * deleted pages (they're un-deleted) and other modifications (will
     * re-create the original page from the logfile)
     *
     * keep txn_get_pagelist(txn) intact during every round, so no 
     * local var for this one.
     */
    while (txn_get_pagelist(txn)) {
        ham_page_t *head = txn_get_pagelist(txn);

        if (!(flags & DO_NOT_NUKE_PAGE_STATS)) {
            /* 
             * nuke critical statistics, such as tracked outer bounds; imagine,
             * for example, a failing erase transaction which, through erasing 
             * the top-most key, lowers the actual upper bound, after which 
             * the transaction fails at some later point. If we 
             * didn't 'rewind' our bounds statistics, a subsequent 
             * out-of-bounds insert (~ append) could FAIL because the 
             * hinter would be working with incorrect bounds 
             * information!
             *
             * Hence we 'reverse' our statistics here, and the easiest route 
             * is to just nuke the critical bits; subsequent find/insert/erase 
             * operations will ensure that the stats get updated again 
             * anyhow. All we lose then is a few subsequent operations, which 
             * might have been hinted if we had played a smarter game of 
             * statistics 'reversal'. So be it.
             */
            ham_db_t *db = page_get_owner(head);

            /*
             * only need to do this for index pages anyhow, and those are the 
             * ones which have their 'ownership' set.
             */
            if (db) {
                stats_page_is_nuked(db, head, HAM_FALSE); 
            }
        }

        ham_assert(page_is_in_list(txn_get_pagelist(txn), head, PAGE_LIST_TXN),
                             (0));
        txn_get_pagelist(txn) = page_list_remove(head, PAGE_LIST_TXN, head);

        /* if this page was allocated by this transaction, then we can
         * move the whole page to the freelist */
        if (page_get_alloc_txn_id(head)==txn_get_id(txn)) {
            (void)freel_mark_free(env, 0, page_get_self(head), 
                    env_get_pagesize(env), HAM_TRUE);
        }
        else {
            /* remove the 'delete pending' flag */
            page_set_npers_flags(head, 
                    page_get_npers_flags(head)&~PAGE_NPERS_DELETE_PENDING);

            /* if the page is dirty, and RECOVERY is enabled: recreate
             * the original, unmodified page from the log */
            if (env_get_log(env) && page_is_dirty(head)) {
                st=ham_log_recreate(env_get_log(env), head);
                if (st)
                    return (st);
                /*page_set_undirty(head); */
            }
        }

        /* page is no longer in use */
        page_release_ref(head);
    }

    ham_assert(txn_get_pagelist(txn)==0, (0));

    return (0);
}
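/*
 * Illustrative sketch only (not part of the original source, hence guarded
 * by #if 0): a typical internal abort path. Any cursors attached to the
 * transaction must be closed first, otherwise txn_abort() returns
 * HAM_CURSOR_STILL_OPEN.
 */
#if 0
static ham_status_t
example_abort_on_failure(ham_env_t *env)
{
    ham_txn_t txn;
    ham_status_t st;

    st=txn_begin(&txn, env, 0);
    if (st)
        return (st);

    st=do_some_modifications(env);      /* hypothetical helper */
    if (st) {
        /* roll back: pages allocated by this txn go to the freelist,
         * other modified pages are recreated from the log (if recovery
         * is enabled) */
        (void)txn_abort(&txn, 0);
        return (st);
    }
    return (txn_commit(&txn, 0));
}
#endif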
ham_status_t
txn_commit(ham_txn_t *txn, ham_u32_t flags)
{
    ham_status_t st;
    ham_env_t *env=txn_get_env(txn);

    /*
     * are cursors attached to this txn? if yes, fail
     */
    if (txn_get_cursor_refcount(txn)) {
        ham_trace(("transaction cannot be committed till all attached "
                    "cursors are closed"));
        return HAM_CURSOR_STILL_OPEN;
    }

    /*
     * in case of logging: write after-images of all modified pages,
     * if they were modified by this transaction;
     * then write the transaction boundary
     */
    if (env_get_log(env) && !(txn_get_flags(txn)&HAM_TXN_READ_ONLY)) 
    {
        ham_page_t *head=txn_get_pagelist(txn);
        while (head) {
            ham_page_t *next;

            next=page_get_next(head, PAGE_LIST_TXN);
            if (page_get_dirty_txn(head)==txn_get_id(txn) 
                    || page_get_dirty_txn(head)==PAGE_DUMMY_TXN_ID) {
                st=ham_log_add_page_after(head);
                if (st) 
                    return st;
            }
            head=next;
        }

        st=ham_log_append_txn_commit(env_get_log(env), txn);
        if (st) 
            return st;
    }

    env_set_txn(env, 0);

    /*
     * flush the pages
     *
     * shouldn't use local var for the list head, as
     * txn_get_pagelist(txn) should be kept up to date and correctly
     * formatted while we call db_free_page() et al.
     */
    while (txn_get_pagelist(txn))
    {
        ham_page_t *head = txn_get_pagelist(txn);
        
        txn_get_pagelist(txn) = page_list_remove(head, PAGE_LIST_TXN, head);

        /* page is no longer in use */
        page_release_ref(head);

        /* 
         * delete the page? 
         */
        if (page_get_npers_flags(head)&PAGE_NPERS_DELETE_PENDING) {
            /* remove page from cache, add it to garbage list */
            page_set_undirty(head);
        
            st=db_free_page(head, DB_MOVE_TO_FREELIST);
            if (st)
                return (st);
        }
        else if (flags & HAM_TXN_FORCE_WRITE) {
            /* flush the page */
            st=db_flush_page(env, head, 
                    flags & HAM_TXN_FORCE_WRITE ? HAM_WRITE_THROUGH : 0);
            if (st) {
                page_add_ref(head);
                /* failure: re-insert into transaction list! */
                txn_get_pagelist(txn) = page_list_insert(txn_get_pagelist(txn),
                            PAGE_LIST_TXN, head);
                return (st);
            }
        }
    }

    txn_set_pagelist(txn, 0);

    return HAM_SUCCESS;
}
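/*
 * Illustrative sketch only (not part of the original source, hence guarded
 * by #if 0): committing with HAM_TXN_FORCE_WRITE asks txn_commit() to flush
 * the pages touched by the transaction write-through instead of leaving
 * them dirty in the cache.
 */
#if 0
static ham_status_t
example_commit_durably(ham_env_t *env)
{
    ham_txn_t txn;
    ham_status_t st;

    st=txn_begin(&txn, env, 0);
    if (st)
        return (st);

    /* ... perform modifications ... */

    return (txn_commit(&txn, HAM_TXN_FORCE_WRITE));
}
#endif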